mlrun 1.10.0rc25__py3-none-any.whl → 1.10.0rc27__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic; see the package's registry page for more details.

Files changed (44)
  1. mlrun/artifacts/llm_prompt.py +8 -1
  2. mlrun/common/model_monitoring/helpers.py +86 -0
  3. mlrun/common/schemas/hub.py +11 -18
  4. mlrun/config.py +2 -3
  5. mlrun/datastore/__init__.py +2 -2
  6. mlrun/datastore/datastore_profile.py +27 -3
  7. mlrun/datastore/model_provider/huggingface_provider.py +5 -1
  8. mlrun/datastore/model_provider/model_provider.py +1 -1
  9. mlrun/datastore/s3.py +24 -2
  10. mlrun/datastore/storeytargets.py +2 -3
  11. mlrun/db/base.py +14 -0
  12. mlrun/db/httpdb.py +11 -2
  13. mlrun/db/nopdb.py +13 -0
  14. mlrun/k8s_utils.py +0 -14
  15. mlrun/model_monitoring/applications/base.py +20 -3
  16. mlrun/model_monitoring/controller.py +5 -3
  17. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +3 -1
  18. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
  19. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +3 -0
  20. mlrun/model_monitoring/helpers.py +5 -5
  21. mlrun/projects/pipelines.py +2 -2
  22. mlrun/projects/project.py +5 -5
  23. mlrun/run.py +12 -1
  24. mlrun/runtimes/base.py +0 -3
  25. mlrun/runtimes/mounts.py +15 -2
  26. mlrun/runtimes/nuclio/function.py +35 -26
  27. mlrun/runtimes/pod.py +153 -11
  28. mlrun/serving/routers.py +23 -41
  29. mlrun/serving/server.py +1 -0
  30. mlrun/serving/states.py +3 -3
  31. mlrun/serving/system_steps.py +52 -29
  32. mlrun/serving/v2_serving.py +9 -10
  33. mlrun/utils/helpers.py +10 -13
  34. mlrun/utils/notifications/notification/base.py +18 -0
  35. mlrun/utils/notifications/notification/git.py +2 -4
  36. mlrun/utils/notifications/notification/slack.py +2 -4
  37. mlrun/utils/notifications/notification/webhook.py +2 -5
  38. mlrun/utils/version/version.json +2 -2
  39. {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc27.dist-info}/METADATA +22 -26
  40. {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc27.dist-info}/RECORD +44 -44
  41. {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc27.dist-info}/WHEEL +0 -0
  42. {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc27.dist-info}/entry_points.txt +0 -0
  43. {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc27.dist-info}/licenses/LICENSE +0 -0
  44. {mlrun-1.10.0rc25.dist-info → mlrun-1.10.0rc27.dist-info}/top_level.txt +0 -0
@@ -25,10 +25,12 @@ from mlrun.utils import logger
25
25
 
26
26
  def _normalize_dict_for_v3io_frames(event: dict[str, Any]) -> dict[str, Any]:
27
27
  """
28
- Normalize user defined keys - input data to a model and its predictions,
29
- to a form V3IO frames tolerates.
28
+ Normalize user-defined keys (e.g., model input data and predictions) to a format V3IO Frames tolerates.
30
29
 
31
- The dictionary keys should conform to '^[a-zA-Z_:]([a-zA-Z0-9_:])*$'.
30
+ - Keys must match regex: '^[a-zA-Z_:]([a-zA-Z0-9_:])*$'
31
+ - Replace invalid characters (e.g., '-') with '_'.
32
+ - Prefix keys starting with digits with '_'.
33
+ - Flatten nested dictionaries using dot notation, while normalizing keys recursively.
32
34
  """
33
35
  prefix = "_"
34
36
 
@@ -38,7 +40,18 @@ def _normalize_dict_for_v3io_frames(event: dict[str, Any]) -> dict[str, Any]:
38
40
  return prefix + key
39
41
  return key
40
42
 
41
- return {norm_key(k): v for k, v in event.items()}
43
+ def flatten_dict(d: dict[str, Any], parent_key: str = "") -> dict[str, Any]:
44
+ items = {}
45
+ for k, v in d.items():
46
+ new_key = norm_key(k)
47
+ full_key = f"{parent_key}.{new_key}" if parent_key else new_key
48
+ if isinstance(v, dict):
49
+ items.update(flatten_dict(v, full_key))
50
+ else:
51
+ items[full_key] = v
52
+ return items
53
+
54
+ return flatten_dict(event)
42
55
 
43
56
 
44
57
  class ProcessBeforeTSDB(mlrun.feature_store.steps.MapClass):
@@ -973,6 +973,9 @@ class V3IOTSDBConnector(TSDBConnector):
973
973
  start: Optional[datetime] = None,
974
974
  end: Optional[datetime] = None,
975
975
  ) -> dict[str, float]:
976
+ if not endpoint_ids:
977
+ return {}
978
+
976
979
  # Get the last request timestamp for each endpoint from the KV table.
977
980
  # The result of the query is a list of dictionaries,
978
981
  # each dictionary contains the endpoint id and the last request timestamp.
@@ -143,7 +143,7 @@ def get_stream_path(
143
143
  return stream_uri.replace("v3io://", f"ds://{profile.name}")
144
144
 
145
145
  elif isinstance(
146
- profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaSource
146
+ profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream
147
147
  ):
148
148
  topic = mlrun.common.model_monitoring.helpers.get_kafka_topic(
149
149
  project=project, function_name=function_name
@@ -152,7 +152,7 @@ def get_stream_path(
152
152
  else:
153
153
  raise mlrun.errors.MLRunValueError(
154
154
  f"Received an unexpected stream profile type: {type(profile)}\n"
155
- "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaSource`."
155
+ "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaStream`."
156
156
  )
157
157
 
158
158
 
@@ -300,7 +300,7 @@ def _get_v3io_output_stream(
300
300
 
301
301
  def _get_kafka_output_stream(
302
302
  *,
303
- kafka_profile: mlrun.datastore.datastore_profile.DatastoreProfileKafkaSource,
303
+ kafka_profile: mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream,
304
304
  project: str,
305
305
  function_name: str,
306
306
  mock: bool = False,
@@ -356,7 +356,7 @@ def get_output_stream(
356
356
  )
357
357
 
358
358
  elif isinstance(
359
- profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaSource
359
+ profile, mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream
360
360
  ):
361
361
  return _get_kafka_output_stream(
362
362
  kafka_profile=profile,
@@ -368,7 +368,7 @@ def get_output_stream(
368
368
  else:
369
369
  raise mlrun.errors.MLRunValueError(
370
370
  f"Received an unexpected stream profile type: {type(profile)}\n"
371
- "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaSource`."
371
+ "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaStream`."
372
372
  )
373
373
 
374
374
 
@@ -228,11 +228,11 @@ class _PipelineContext:
228
228
  force_run_local = mlrun.mlconf.force_run_local
229
229
  if force_run_local is None or force_run_local == "auto":
230
230
  force_run_local = not mlrun.mlconf.is_api_running_on_k8s()
231
+
232
+ if self.workflow:
231
233
  if not mlrun.mlconf.kfp_url:
232
234
  logger.debug("Kubeflow pipeline URL is not set, running locally")
233
235
  force_run_local = True
234
-
235
- if self.workflow:
236
236
  force_run_local = force_run_local or self.workflow.run_local
237
237
 
238
238
  return force_run_local
mlrun/projects/project.py CHANGED
@@ -3816,7 +3816,7 @@ class MlrunProject(ModelObj):
3816
3816
 
3817
3817
  import mlrun
3818
3818
  from mlrun.datastore.datastore_profile import (
3819
- DatastoreProfileKafkaSource,
3819
+ DatastoreProfileKafkaStream,
3820
3820
  DatastoreProfileTDEngine,
3821
3821
  )
3822
3822
 
@@ -3833,7 +3833,7 @@ class MlrunProject(ModelObj):
3833
3833
  project.register_datastore_profile(tsdb_profile)
3834
3834
 
3835
3835
  # Create and register stream profile
3836
- stream_profile = DatastoreProfileKafkaSource(
3836
+ stream_profile = DatastoreProfileKafkaStream(
3837
3837
  name="my-kafka",
3838
3838
  brokers=["<kafka-broker-ip-address>:9094"],
3839
3839
  topics=[], # Keep the topics list empty
@@ -3875,9 +3875,9 @@ class MlrunProject(ModelObj):
3875
3875
 
3876
3876
  .. code-block:: python
3877
3877
 
3878
- from mlrun.datastore.datastore_profile import DatastoreProfileKafkaSource
3878
+ from mlrun.datastore.datastore_profile import DatastoreProfileKafkaStream
3879
3879
 
3880
- stream_profile = DatastoreProfileKafkaSource(
3880
+ stream_profile = DatastoreProfileKafkaStream(
3881
3881
  name="confluent-kafka",
3882
3882
  brokers=["<server-domain-start>.confluent.cloud:9092"],
3883
3883
  topics=[],
@@ -3906,7 +3906,7 @@ class MlrunProject(ModelObj):
3906
3906
  The supported profiles are:
3907
3907
 
3908
3908
  * :py:class:`~mlrun.datastore.datastore_profile.DatastoreProfileV3io`
3909
- * :py:class:`~mlrun.datastore.datastore_profile.DatastoreProfileKafkaSource`
3909
+ * :py:class:`~mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream`
3910
3910
 
3911
3911
  You need to register one of them, and pass the profile's name.
3912
3912
  :param replace_creds: If ``True`` - override the existing credentials.
mlrun/run.py CHANGED
@@ -222,7 +222,8 @@ def get_or_create_ctx(
222
222
  :param spec: dictionary holding run spec
223
223
  :param with_env: look for context in environment vars, default True
224
224
  :param rundb: path/url to the metadata and artifact database
225
- :param project: project to initiate the context in (by default `mlrun.mlconf.active_project`)
225
+ :param project: project to initiate the context in (by default `mlrun.mlconf.active_project`).
226
+ If not set, an active project must exist.
226
227
  :param upload_artifacts: when using local context (not as part of a job/run), upload artifacts to the
227
228
  system default artifact path location
228
229
  :return: execution context
@@ -277,6 +278,16 @@ def get_or_create_ctx(
277
278
  if newspec and not isinstance(newspec, dict):
278
279
  newspec = json.loads(newspec)
279
280
 
281
+ if (
282
+ not newspec.get("metadata", {}).get("project")
283
+ and not project
284
+ and not mlconf.active_project
285
+ ):
286
+ raise mlrun.errors.MLRunMissingProjectError(
287
+ """No active project found. Make sure to set an active project using: mlrun.get_or_create_project()
288
+ You can verify the active project with: mlrun.mlconf.active_project"""
289
+ )
290
+
280
291
  if not newspec:
281
292
  newspec = {}
282
293
  if upload_artifacts:
mlrun/runtimes/base.py CHANGED
@@ -142,9 +142,6 @@ class FunctionSpec(ModelObj):
142
142
  def build(self, build):
143
143
  self._build = self._verify_dict(build, "build", ImageBuilder)
144
144
 
145
- def enrich_function_preemption_spec(self):
146
- pass
147
-
148
145
  def validate_service_account(self, allowed_service_accounts):
149
146
  pass
150
147
 
mlrun/runtimes/mounts.py CHANGED
@@ -14,6 +14,7 @@
14
14
 
15
15
  import os
16
16
  import typing
17
+ import warnings
17
18
  from collections import namedtuple
18
19
 
19
20
  from mlrun.config import config
@@ -247,10 +248,22 @@ def mount_s3(
247
248
  def _use_s3_cred(runtime: "KubeResource"):
248
249
  _access_key = aws_access_key or os.environ.get(prefix + "AWS_ACCESS_KEY_ID")
249
250
  _secret_key = aws_secret_key or os.environ.get(prefix + "AWS_SECRET_ACCESS_KEY")
250
- _endpoint_url = endpoint_url or os.environ.get(prefix + "S3_ENDPOINT_URL")
251
+
252
+ # Check for endpoint URL with backward compatibility
253
+ _endpoint_url = endpoint_url or os.environ.get(prefix + "AWS_ENDPOINT_URL_S3")
254
+ if not _endpoint_url:
255
+ # Check for deprecated environment variable
256
+ _endpoint_url = os.environ.get(prefix + "S3_ENDPOINT_URL")
257
+ if _endpoint_url:
258
+ warnings.warn(
259
+ "S3_ENDPOINT_URL is deprecated in 1.10.0 and will be removed in 1.12.0, "
260
+ "use AWS_ENDPOINT_URL_S3 instead.",
261
+ # TODO: Remove this in 1.12.0
262
+ FutureWarning,
263
+ )
251
264
 
252
265
  if _endpoint_url:
253
- runtime.set_env(prefix + "S3_ENDPOINT_URL", _endpoint_url)
266
+ runtime.set_env(prefix + "AWS_ENDPOINT_URL_S3", _endpoint_url)
254
267
  if aws_region:
255
268
  runtime.set_env(prefix + "AWS_REGION", aws_region)
256
269
  if non_anonymous:
@@ -968,24 +968,6 @@ class RemoteRuntime(KubeResource):
968
968
  self._mock_server = None
969
969
 
970
970
  if "://" not in path:
971
- if not self.status.address:
972
- # here we check that if default http trigger is disabled, function contains a custom http trigger
973
- # Otherwise, the function is not invokable, so we raise an error
974
- if (
975
- not self._trigger_of_kind_exists(kind="http")
976
- and self.spec.disable_default_http_trigger
977
- ):
978
- raise mlrun.errors.MLRunPreconditionFailedError(
979
- "Default http trigger creation is disabled and there is no any other custom http trigger, "
980
- "so function can not be invoked via http. Either enable default http trigger creation or "
981
- "create custom http trigger"
982
- )
983
- state, _, _ = self._get_state()
984
- if state not in ["ready", "scaledToZero"]:
985
- logger.warning(f"Function is in the {state} state")
986
- if not self.status.address:
987
- raise ValueError("no function address first run .deploy()")
988
-
989
971
  path = self._resolve_invocation_url(path, force_external_address)
990
972
 
991
973
  if headers is None:
@@ -1228,19 +1210,47 @@ class RemoteRuntime(KubeResource):
1228
1210
  # internal / external invocation urls is a nuclio >= 1.6.x feature
1229
1211
  # try to infer the invocation url from the internal and if not exists, use external.
1230
1212
  # $$$$ we do not want to use the external invocation url (e.g.: ingress, nodePort, etc.)
1213
+
1214
+ # check function state before invocation
1215
+ state, _, _ = self._get_state()
1216
+ if state not in ["ready", "scaledToZero"]:
1217
+ logger.warning(f"Function is in the {state} state")
1218
+
1219
+ # prefer internal invocation url if running inside k8s cluster
1231
1220
  if (
1232
1221
  not force_external_address
1233
1222
  and self.status.internal_invocation_urls
1234
1223
  and mlrun.k8s_utils.is_running_inside_kubernetes_cluster()
1235
1224
  ):
1236
- return mlrun.utils.helpers.join_urls(
1225
+ url = mlrun.utils.helpers.join_urls(
1237
1226
  f"http://{self.status.internal_invocation_urls[0]}", path
1238
1227
  )
1228
+ logger.debug(
1229
+ f"Using internal invocation url {url}. Make sure you have network access to the k8s cluster. "
1230
+ f"Otherwise, set force_external_address to True"
1231
+ )
1232
+ return url
1239
1233
 
1240
1234
  if self.status.external_invocation_urls:
1241
1235
  return mlrun.utils.helpers.join_urls(
1242
1236
  f"http://{self.status.external_invocation_urls[0]}", path
1243
1237
  )
1238
+
1239
+ if not self.status.address:
1240
+ # if there is no address
1241
+ # here we check that if default http trigger is disabled, function contains a custom http trigger
1242
+ # Otherwise, the function is not invokable, so we raise an error
1243
+ if (
1244
+ not self._trigger_of_kind_exists(kind="http")
1245
+ and self.spec.disable_default_http_trigger
1246
+ ):
1247
+ raise mlrun.errors.MLRunPreconditionFailedError(
1248
+ "Default http trigger creation is disabled and there is no any other custom http trigger, "
1249
+ "so function can not be invoked via http. Either enable default http trigger creation or "
1250
+ "create custom http trigger"
1251
+ )
1252
+ else:
1253
+ raise ValueError("no function address first run .deploy()")
1244
1254
  else:
1245
1255
  return mlrun.utils.helpers.join_urls(f"http://{self.status.address}", path)
1246
1256
 
@@ -1294,6 +1304,8 @@ class RemoteRuntime(KubeResource):
1294
1304
  def get_url(
1295
1305
  self,
1296
1306
  force_external_address: bool = False,
1307
+ # leaving auth_info for BC
1308
+ # TODO: remove in 1.12.0
1297
1309
  auth_info: AuthInfo = None,
1298
1310
  ):
1299
1311
  """
@@ -1304,13 +1316,10 @@ class RemoteRuntime(KubeResource):
1304
1316
 
1305
1317
  :return: returns function's url
1306
1318
  """
1307
- if not self.status.address:
1308
- state, _, _ = self._get_state(auth_info=auth_info)
1309
- if state != "ready" or not self.status.address:
1310
- raise ValueError(
1311
- "no function address or not ready, first run .deploy()"
1312
- )
1313
-
1319
+ if auth_info:
1320
+ logger.warning(
1321
+ "Deprecated parameter 'auth_info' was provided, but will be ignored. Will be removed in 1.12.0."
1322
+ )
1314
1323
  return self._resolve_invocation_url("", force_external_address)
1315
1324
 
1316
1325
  @staticmethod
mlrun/runtimes/pod.py CHANGED
@@ -17,6 +17,7 @@ import os
17
17
  import re
18
18
  import time
19
19
  import typing
20
+ import warnings
20
21
  from collections.abc import Iterable
21
22
  from enum import Enum
22
23
 
@@ -35,6 +36,7 @@ from mlrun.common.schemas import (
35
36
 
36
37
  from ..config import config as mlconf
37
38
  from ..k8s_utils import (
39
+ generate_preemptible_nodes_affinity_terms,
38
40
  validate_node_selectors,
39
41
  )
40
42
  from ..utils import logger, update_in
@@ -874,6 +876,133 @@ class KubeResource(BaseRuntime):
874
876
  """
875
877
  self.spec.with_requests(mem, cpu, patch=patch)
876
878
 
879
+ @staticmethod
880
+ def detect_preemptible_node_selector(node_selector: dict[str, str]) -> list[str]:
881
+ """
882
+ Check whether any provided node selector matches preemptible selectors.
883
+
884
+ :param node_selector: User-provided node selector mapping.
885
+ :return: List of `"key='value'"` strings that match a preemptible selector.
886
+ """
887
+ preemptible_node_selector = mlconf.get_preemptible_node_selector()
888
+
889
+ return [
890
+ f"'{key}': '{val}'"
891
+ for key, val in node_selector.items()
892
+ if preemptible_node_selector.get(key) == val
893
+ ]
894
+
895
+ def detect_preemptible_tolerations(
896
+ self, tolerations: list[k8s_client.V1Toleration]
897
+ ) -> list[str]:
898
+ """
899
+ Check whether any provided toleration matches preemptible tolerations.
900
+
901
+ :param tolerations: User-provided tolerations.
902
+ :return: List of formatted toleration strings that are considered preemptible.
903
+ """
904
+ preemptible_tolerations = [
905
+ k8s_client.V1Toleration(
906
+ key=toleration.get("key"),
907
+ value=toleration.get("value"),
908
+ effect=toleration.get("effect"),
909
+ )
910
+ for toleration in mlconf.get_preemptible_tolerations()
911
+ ]
912
+
913
+ def _format_toleration(toleration):
914
+ return f"'{toleration.key}'='{toleration.value}' (effect: '{toleration.effect}')"
915
+
916
+ return [
917
+ _format_toleration(toleration)
918
+ for toleration in tolerations
919
+ if toleration in preemptible_tolerations
920
+ ]
921
+
922
+ def detect_preemptible_affinity(self, affinity: k8s_client.V1Affinity) -> list[str]:
923
+ """
924
+ Check whether any provided affinity rules match preemptible affinity configs.
925
+
926
+ :param affinity: User-provided affinity object.
927
+ :return: List of formatted expressions that overlap with preemptible terms.
928
+ """
929
+ preemptible_affinity_terms = generate_preemptible_nodes_affinity_terms()
930
+ conflicting_affinities = []
931
+
932
+ if (
933
+ affinity
934
+ and affinity.node_affinity
935
+ and affinity.node_affinity.required_during_scheduling_ignored_during_execution
936
+ ):
937
+ user_terms = affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms
938
+ for user_term in user_terms:
939
+ user_expressions = {
940
+ (expr.key, expr.operator, tuple(expr.values or []))
941
+ for expr in user_term.match_expressions or []
942
+ }
943
+
944
+ for preemptible_term in preemptible_affinity_terms:
945
+ preemptible_expressions = {
946
+ (expr.key, expr.operator, tuple(expr.values or []))
947
+ for expr in preemptible_term.match_expressions or []
948
+ }
949
+
950
+ # Ensure operators match and preemptible expressions are present
951
+ common_exprs = user_expressions & preemptible_expressions
952
+ if common_exprs:
953
+ formatted = ", ".join(
954
+ f"'{key} {operator} {list(values)}'"
955
+ for key, operator, values in common_exprs
956
+ )
957
+ conflicting_affinities.append(formatted)
958
+ return conflicting_affinities
959
+
960
+ def raise_preemptible_warning(
961
+ self,
962
+ node_selector: typing.Optional[dict[str, str]],
963
+ tolerations: typing.Optional[list[k8s_client.V1Toleration]],
964
+ affinity: typing.Optional[k8s_client.V1Affinity],
965
+ ) -> None:
966
+ """
967
+ Detect conflicts and emit a single consolidated warning if needed.
968
+
969
+ :param node_selector: User-provided node selector.
970
+ :param tolerations: User-provided tolerations.
971
+ :param affinity: User-provided affinity.
972
+ :warns: PreemptionWarning - Emitted when any of the provided selectors,
973
+ tolerations, or affinity terms match the configured preemptible
974
+ settings. The message lists the conflicting items.
975
+ """
976
+ conflict_messages = []
977
+
978
+ if node_selector:
979
+ ns_conflicts = ", ".join(
980
+ self.detect_preemptible_node_selector(node_selector)
981
+ )
982
+ if ns_conflicts:
983
+ conflict_messages.append(f"Node selectors: {ns_conflicts}")
984
+
985
+ if tolerations:
986
+ tol_conflicts = ", ".join(self.detect_preemptible_tolerations(tolerations))
987
+ if tol_conflicts:
988
+ conflict_messages.append(f"Tolerations: {tol_conflicts}")
989
+
990
+ if affinity:
991
+ affinity_conflicts = ", ".join(self.detect_preemptible_affinity(affinity))
992
+ if affinity_conflicts:
993
+ conflict_messages.append(f"Affinity: {affinity_conflicts}")
994
+
995
+ if conflict_messages:
996
+ warning_componentes = "; \n".join(conflict_messages)
997
+ warnings.warn(
998
+ f"Warning: based on MLRun's preemptible node configuration, the following components \n"
999
+ f"may be removed or adjusted at runtime:\n"
1000
+ f"{warning_componentes}.\n"
1001
+ "This adjustment depends on the function's preemption mode. \n"
1002
+ "The list of potential adjusted preemptible selectors can be viewed here: "
1003
+ "mlrun.mlconf.get_preemptible_node_selector() and mlrun.mlconf.get_preemptible_tolerations()."
1004
+ )
1005
+
877
1006
  def with_node_selection(
878
1007
  self,
879
1008
  node_name: typing.Optional[str] = None,
@@ -882,18 +1011,26 @@ class KubeResource(BaseRuntime):
882
1011
  tolerations: typing.Optional[list[k8s_client.V1Toleration]] = None,
883
1012
  ):
884
1013
  """
885
- Enables to control on which k8s node the job will run
886
-
887
- :param node_name: The name of the k8s node
888
- :param node_selector: Label selector, only nodes with matching labels will be eligible to be picked
889
- :param affinity: Expands the types of constraints you can express - see
890
- https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity
891
- for details
892
- :param tolerations: Tolerations are applied to pods, and allow (but do not require) the pods to schedule
893
- onto nodes with matching taints - see
894
- https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration
895
- for details
1014
+ Configure Kubernetes node scheduling for this function.
1015
+
1016
+ Updates one or more scheduling hints: exact node pinning, label-based selection,
1017
+ affinity/anti-affinity rules, and taint tolerations. Passing ``None`` leaves the
1018
+ current value unchanged; pass an empty dict/list (e.g., ``{}``, ``[]``) to clear.
896
1019
 
1020
+ :param node_name: Exact Kubernetes node name to pin the pod to.
1021
+ :param node_selector: Mapping of label selectors. Use ``{}`` to clear.
1022
+ :param affinity: :class:`kubernetes.client.V1Affinity` constraints.
1023
+ :param tolerations: List of :class:`kubernetes.client.V1Toleration`. Use ``[]`` to clear.
1024
+ :warns: PreemptionWarning - Emitted if provided selectors/tolerations/affinity
1025
+ conflict with the function's preemption mode.
1026
+
1027
+ Example usage:
1028
+ Prefer a GPU pool and allow scheduling on spot nodes::
1029
+
1030
+ job.with_node_selection(
1031
+ node_selector={"nodepool": "gpu"},
1032
+ tolerations=[k8s_client.V1Toleration(key="spot", operator="Exists")],
1033
+ )
897
1034
  """
898
1035
  if node_name:
899
1036
  self.spec.node_name = node_name
@@ -904,6 +1041,11 @@ class KubeResource(BaseRuntime):
904
1041
  self.spec.affinity = affinity
905
1042
  if tolerations is not None:
906
1043
  self.spec.tolerations = tolerations
1044
+ self.raise_preemptible_warning(
1045
+ node_selector=self.spec.node_selector,
1046
+ tolerations=self.spec.tolerations,
1047
+ affinity=self.spec.affinity,
1048
+ )
907
1049
 
908
1050
  def with_priority_class(self, name: typing.Optional[str] = None):
909
1051
  """
mlrun/serving/routers.py CHANGED
@@ -31,6 +31,9 @@ import mlrun.common.model_monitoring
31
31
  import mlrun.common.schemas.model_monitoring
32
32
  from mlrun.utils import logger, now_date
33
33
 
34
+ from ..common.model_monitoring.helpers import (
35
+ get_model_endpoints_creation_task_status,
36
+ )
34
37
  from .utils import RouterToDict, _extract_input_data, _update_result_body
35
38
  from .v2_serving import _ModelLogPusher
36
39
 
@@ -171,46 +174,6 @@ class BaseModelRouter(RouterToDict):
171
174
  """run tasks after processing the event"""
172
175
  return event
173
176
 
174
- def _get_background_task_status(
175
- self,
176
- ) -> mlrun.common.schemas.BackgroundTaskState:
177
- self._background_task_check_timestamp = now_date()
178
- server: mlrun.serving.GraphServer = getattr(
179
- self.context, "_server", None
180
- ) or getattr(self.context, "server", None)
181
- if not self.context.is_mock:
182
- if server.model_endpoint_creation_task_name:
183
- background_task = mlrun.get_run_db().get_project_background_task(
184
- server.project, server.model_endpoint_creation_task_name
185
- )
186
- logger.debug(
187
- "Checking model endpoint creation task status",
188
- task_name=server.model_endpoint_creation_task_name,
189
- )
190
- if (
191
- background_task.status.state
192
- in mlrun.common.schemas.BackgroundTaskState.terminal_states()
193
- ):
194
- logger.info(
195
- f"Model endpoint creation task completed with state {background_task.status.state}"
196
- )
197
- else: # in progress
198
- logger.info(
199
- f"Model endpoint creation task is still in progress with the current state: "
200
- f"{background_task.status.state}. Events will not be monitored for the next "
201
- f"{mlrun.mlconf.model_endpoint_monitoring.model_endpoint_creation_check_period} seconds",
202
- name=self.name,
203
- background_task_check_timestamp=self._background_task_check_timestamp.isoformat(),
204
- )
205
- return background_task.status.state
206
- else:
207
- logger.error(
208
- "Model endpoint creation task name not provided. This function is not being monitored.",
209
- )
210
- elif self.context.monitoring_mock:
211
- return mlrun.common.schemas.BackgroundTaskState.succeeded
212
- return mlrun.common.schemas.BackgroundTaskState.failed
213
-
214
177
  def _update_background_task_state(self, event):
215
178
  if not self.background_task_reached_terminal_state and (
216
179
  self._background_task_check_timestamp is None
@@ -219,7 +182,26 @@ class BaseModelRouter(RouterToDict):
219
182
  seconds=mlrun.mlconf.model_endpoint_monitoring.model_endpoint_creation_check_period
220
183
  )
221
184
  ):
222
- self._background_task_current_state = self._get_background_task_status()
185
+ server: mlrun.serving.GraphServer = getattr(
186
+ self.context, "_server", None
187
+ ) or getattr(self.context, "server", None)
188
+ if not self.context.is_mock:
189
+ (
190
+ self._background_task_current_state,
191
+ self._background_task_check_timestamp,
192
+ _,
193
+ ) = get_model_endpoints_creation_task_status(server)
194
+ elif self.context.monitoring_mock:
195
+ self._background_task_current_state = (
196
+ mlrun.common.schemas.BackgroundTaskState.succeeded
197
+ )
198
+ self._background_task_check_timestamp = mlrun.utils.now_date()
199
+ else:
200
+ self._background_task_current_state = (
201
+ mlrun.common.schemas.BackgroundTaskState.failed
202
+ )
203
+ self._background_task_check_timestamp = mlrun.utils.now_date()
204
+
223
205
  if event.body:
224
206
  event.body["background_task_state"] = (
225
207
  self._background_task_current_state
mlrun/serving/server.py CHANGED
@@ -417,6 +417,7 @@ def add_monitoring_general_steps(
417
417
  "mlrun.serving.system_steps.BackgroundTaskStatus",
418
418
  "background_task_status_step",
419
419
  model_endpoint_creation_strategy=mlrun.common.schemas.ModelEndpointCreationStrategy.SKIP,
420
+ full_event=True,
420
421
  )
421
422
  monitor_flow_step = graph.add_step(
422
423
  "storey.Filter",
mlrun/serving/states.py CHANGED
@@ -39,7 +39,7 @@ import mlrun.common.schemas as schemas
39
39
  from mlrun.artifacts.llm_prompt import LLMPromptArtifact, PlaceholderDefaultDict
40
40
  from mlrun.artifacts.model import ModelArtifact
41
41
  from mlrun.datastore.datastore_profile import (
42
- DatastoreProfileKafkaSource,
42
+ DatastoreProfileKafkaStream,
43
43
  DatastoreProfileKafkaTarget,
44
44
  DatastoreProfileV3io,
45
45
  datastore_profile_read,
@@ -3398,7 +3398,7 @@ def _init_async_objects(context, steps):
3398
3398
  datastore_profile = datastore_profile_read(stream_path)
3399
3399
  if isinstance(
3400
3400
  datastore_profile,
3401
- (DatastoreProfileKafkaTarget, DatastoreProfileKafkaSource),
3401
+ (DatastoreProfileKafkaTarget, DatastoreProfileKafkaStream),
3402
3402
  ):
3403
3403
  step._async_object = KafkaStoreyTarget(
3404
3404
  path=stream_path,
@@ -3414,7 +3414,7 @@ def _init_async_objects(context, steps):
3414
3414
  else:
3415
3415
  raise mlrun.errors.MLRunValueError(
3416
3416
  f"Received an unexpected stream profile type: {type(datastore_profile)}\n"
3417
- "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaSource`."
3417
+ "Expects `DatastoreProfileV3io` or `DatastoreProfileKafkaStream`."
3418
3418
  )
3419
3419
  elif stream_path.startswith("kafka://") or kafka_brokers:
3420
3420
  topic, brokers = parse_kafka_url(stream_path, kafka_brokers)