mlrun 1.10.0rc24__py3-none-any.whl → 1.10.0rc26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (50)
  1. mlrun/artifacts/llm_prompt.py +8 -1
  2. mlrun/common/model_monitoring/helpers.py +86 -0
  3. mlrun/common/schemas/hub.py +25 -18
  4. mlrun/common/schemas/model_monitoring/constants.py +1 -0
  5. mlrun/common/schemas/model_monitoring/model_endpoints.py +10 -1
  6. mlrun/config.py +2 -3
  7. mlrun/datastore/__init__.py +2 -2
  8. mlrun/datastore/azure_blob.py +66 -43
  9. mlrun/datastore/datastore_profile.py +35 -5
  10. mlrun/datastore/model_provider/huggingface_provider.py +122 -30
  11. mlrun/datastore/model_provider/model_provider.py +62 -4
  12. mlrun/datastore/model_provider/openai_provider.py +114 -43
  13. mlrun/datastore/s3.py +24 -2
  14. mlrun/datastore/storeytargets.py +2 -3
  15. mlrun/db/base.py +15 -1
  16. mlrun/db/httpdb.py +17 -6
  17. mlrun/db/nopdb.py +14 -0
  18. mlrun/k8s_utils.py +0 -14
  19. mlrun/model_monitoring/api.py +2 -2
  20. mlrun/model_monitoring/applications/base.py +37 -10
  21. mlrun/model_monitoring/applications/context.py +1 -4
  22. mlrun/model_monitoring/controller.py +15 -5
  23. mlrun/model_monitoring/db/_schedules.py +2 -4
  24. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +3 -1
  25. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
  26. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +3 -0
  27. mlrun/model_monitoring/helpers.py +5 -5
  28. mlrun/platforms/iguazio.py +7 -3
  29. mlrun/projects/project.py +33 -29
  30. mlrun/runtimes/base.py +0 -3
  31. mlrun/runtimes/mounts.py +15 -2
  32. mlrun/runtimes/nuclio/__init__.py +1 -0
  33. mlrun/runtimes/nuclio/application/application.py +11 -2
  34. mlrun/runtimes/nuclio/function.py +10 -0
  35. mlrun/runtimes/nuclio/serving.py +4 -0
  36. mlrun/runtimes/pod.py +153 -11
  37. mlrun/runtimes/utils.py +22 -5
  38. mlrun/serving/routers.py +23 -41
  39. mlrun/serving/server.py +26 -14
  40. mlrun/serving/states.py +3 -3
  41. mlrun/serving/system_steps.py +52 -29
  42. mlrun/serving/v2_serving.py +9 -10
  43. mlrun/utils/helpers.py +5 -2
  44. mlrun/utils/version/version.json +2 -2
  45. {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/METADATA +24 -23
  46. {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/RECORD +50 -50
  47. {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/WHEEL +0 -0
  48. {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/entry_points.txt +0 -0
  49. {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/licenses/LICENSE +0 -0
  50. {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/top_level.txt +0 -0
mlrun/projects/project.py CHANGED
@@ -2749,16 +2749,18 @@ class MlrunProject(ModelObj):
         | Creating a function with non project source is done by specifying a module ``handler`` and on the
           returned function set the source with ``function.with_source_archive(<source>)``.
 
-        Support URL prefixes:
+        Supported URL prefixes:
 
-        | Object (s3://, v3io://, ..)
-        | MLRun DB e.g. db://project/func:ver
-        | Functions hub/market: e.g. hub://auto-trainer:master
+        - Object: s3://, v3io://, etc.
+        - MLRun DB: e.g db://project/func:ver
+        - Function hub/market: e.g. hub://auto-trainer:master
 
         Examples::
 
             proj.set_function(func_object)
-            proj.set_function("http://.../mynb.ipynb", "train")
+            proj.set_function(
+                "http://.../mynb.ipynb", "train", kind="job", image="mlrun/mlrun"
+            )
             proj.set_function("./func.yaml")
             proj.set_function("hub://get_toy_data", "getdata")
 
@@ -2785,18 +2787,6 @@ class MlrunProject(ModelObj):
             # By providing a path to a pip requirements file
             proj.set_function("my.py", requirements="requirements.txt")
 
-        One of the most important parameters is 'kind', used to specify the chosen runtime. The options are:
-        - local: execute a local python or shell script
-        - job: insert the code into a Kubernetes pod and execute it
-        - nuclio: insert the code into a real-time serverless nuclio function
-        - serving: insert code into orchestrated nuclio function(s) forming a DAG
-        - dask: run the specified python code / script as Dask Distributed job
-        - mpijob: run distributed Horovod jobs over the MPI job operator
-        - spark: run distributed Spark job using Spark Kubernetes Operator
-        - remote-spark: run distributed Spark job on remote Spark service
-        - databricks: run code on Databricks cluster (python scripts, Spark etc.)
-        - application: run a long living application (e.g. a web server, UI, etc.)
-
         Learn more about :doc:`../../concepts/functions-overview`.
 
         :param func:        Function object or spec/code url, None refers to current Notebook
@@ -2804,8 +2794,20 @@ class MlrunProject(ModelObj):
                             Versions (e.g. myfunc:v1). If the `tag` parameter is provided, the tag in the name
                             must match the tag parameter.
                             Specifying a tag in the name will update the project's tagged function (myfunc:v1)
-        :param kind:        Runtime kind e.g. job, nuclio, spark, dask, mpijob
-                            Default: job
+        :param kind:        Default: job. One of
+
+                            - local: execute a local python or shell script
+                            - job: insert the code into a Kubernetes pod and execute it
+                            - nuclio: insert the code into a real-time serverless nuclio function
+                            - serving: insert code into orchestrated nuclio function(s) forming a DAG
+                            - dask: run the specified python code / script as Dask Distributed job
+                            - mpijob: run distributed Horovod jobs over the MPI job operator
+                            - spark: run distributed Spark job using Spark Kubernetes Operator
+                            - remote-spark: run distributed Spark job on remote Spark service
+                            - databricks: run code on Databricks cluster (python scripts, Spark etc.)
+                            - application: run a long living application (e.g. a web server, UI, etc.)
+                            - handler: execute a python handler (used automatically in notebooks or for debug)
+
         :param image:       Docker image to be used, can also be specified in the function object/yaml
         :param handler:     Default function handler to invoke (can only be set with .py/.ipynb files)
        :param with_repo:   Add (clone) the current repo to the build source - use when the function code is in
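
The runtime kinds are now documented directly on the `kind` parameter. A minimal usage sketch of the call documented above (the project, file, and handler names here are illustrative, not from this diff):

    import mlrun

    project = mlrun.get_or_create_project("demo", context="./")
    # register a local Python file as a Kubernetes job with a default handler
    trainer = project.set_function(
        "trainer.py", name="trainer", kind="job", image="mlrun/mlrun", handler="train"
    )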
@@ -3814,7 +3816,7 @@ class MlrunProject(ModelObj):
 
             import mlrun
             from mlrun.datastore.datastore_profile import (
-                DatastoreProfileKafkaSource,
+                DatastoreProfileKafkaStream,
                 DatastoreProfileTDEngine,
             )
 
@@ -3831,7 +3833,7 @@ class MlrunProject(ModelObj):
             project.register_datastore_profile(tsdb_profile)
 
             # Create and register stream profile
-            stream_profile = DatastoreProfileKafkaSource(
+            stream_profile = DatastoreProfileKafkaStream(
                 name="my-kafka",
                 brokers=["<kafka-broker-ip-address>:9094"],
                 topics=[],  # Keep the topics list empty
@@ -3873,9 +3875,9 @@ class MlrunProject(ModelObj):
 
         .. code-block:: python
 
-            from mlrun.datastore.datastore_profile import DatastoreProfileKafkaSource
+            from mlrun.datastore.datastore_profile import DatastoreProfileKafkaStream
 
-            stream_profile = DatastoreProfileKafkaSource(
+            stream_profile = DatastoreProfileKafkaStream(
                 name="confluent-kafka",
                 brokers=["<server-domain-start>.confluent.cloud:9092"],
                 topics=[],
@@ -3904,7 +3906,7 @@ class MlrunProject(ModelObj):
         The supported profiles are:
 
         * :py:class:`~mlrun.datastore.datastore_profile.DatastoreProfileV3io`
-        * :py:class:`~mlrun.datastore.datastore_profile.DatastoreProfileKafkaSource`
+        * :py:class:`~mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream`
 
         You need to register one of them, and pass the profile's name.
         :param replace_creds: If ``True`` - override the existing credentials.
@@ -3944,7 +3946,9 @@ class MlrunProject(ModelObj):
         start: Optional[datetime.datetime] = None,
         end: Optional[datetime.datetime] = None,
         top_level: bool = False,
-        mode: Optional[mlrun.common.schemas.EndpointMode] = None,
+        modes: Optional[
+            Union[mm_constants.EndpointMode, list[mm_constants.EndpointMode]]
+        ] = None,
         uids: Optional[list[str]] = None,
         latest_only: bool = False,
         tsdb_metrics: bool = False,
@@ -3960,7 +3964,7 @@ class MlrunProject(ModelObj):
         5) function_tag
         6) labels
         7) top level
-        8) mode
+        8) modes
         9) uids
         10) start and end time, corresponding to the `created` field.
         By default, when no filters are applied, all available endpoints for the given project will be listed.
@@ -3982,8 +3986,8 @@ class MlrunProject(ModelObj):
         :param start: The start time to filter by.Corresponding to the `created` field.
         :param end: The end time to filter by. Corresponding to the `created` field.
         :param top_level: If true will return only routers and endpoint that are NOT children of any router.
-        :param mode: Specifies the mode of the model endpoint. Can be "real-time" (0), "batch" (1), or
-            both if set to None.
+        :param modes: Specifies the mode of the model endpoint. Can be "real-time" (0), "batch" (1),
+            "batch_legacy" (2). If set to None, all are included.
         :param uids: If passed will return a list `ModelEndpoint` object with uid in uids.
         :param tsdb_metrics: When True, the time series metrics will be added to the output
             of the resulting.
@@ -4005,7 +4009,7 @@ class MlrunProject(ModelObj):
             start=start,
             end=end,
             top_level=top_level,
-            mode=mode,
+            modes=modes,
             uids=uids,
             latest_only=latest_only,
             tsdb_metrics=tsdb_metrics,
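
The `mode` filter is now `modes` and accepts a single value or a list. A hedged sketch of the new call shape (the exact `EndpointMode` member spelling below is an assumption; check `mm_constants.EndpointMode` in this release):

    import mlrun
    from mlrun.common.schemas.model_monitoring import constants as mm_constants

    project = mlrun.get_or_create_project("my-project", context="./")
    # a single EndpointMode or a list is accepted; None (the default) includes all modes
    endpoints = project.list_model_endpoints(modes=[mm_constants.EndpointMode.BATCH])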
mlrun/runtimes/base.py CHANGED
@@ -142,9 +142,6 @@ class FunctionSpec(ModelObj):
     def build(self, build):
         self._build = self._verify_dict(build, "build", ImageBuilder)
 
-    def enrich_function_preemption_spec(self):
-        pass
-
     def validate_service_account(self, allowed_service_accounts):
         pass
 
mlrun/runtimes/mounts.py CHANGED
@@ -14,6 +14,7 @@
 
 import os
 import typing
+import warnings
 from collections import namedtuple
 
 from mlrun.config import config
@@ -247,10 +248,22 @@ def mount_s3(
     def _use_s3_cred(runtime: "KubeResource"):
         _access_key = aws_access_key or os.environ.get(prefix + "AWS_ACCESS_KEY_ID")
         _secret_key = aws_secret_key or os.environ.get(prefix + "AWS_SECRET_ACCESS_KEY")
-        _endpoint_url = endpoint_url or os.environ.get(prefix + "S3_ENDPOINT_URL")
+
+        # Check for endpoint URL with backward compatibility
+        _endpoint_url = endpoint_url or os.environ.get(prefix + "AWS_ENDPOINT_URL_S3")
+        if not _endpoint_url:
+            # Check for deprecated environment variable
+            _endpoint_url = os.environ.get(prefix + "S3_ENDPOINT_URL")
+            if _endpoint_url:
+                warnings.warn(
+                    "S3_ENDPOINT_URL is deprecated in 1.10.0 and will be removed in 1.12.0, "
+                    "use AWS_ENDPOINT_URL_S3 instead.",
+                    # TODO: Remove this in 1.12.0
+                    FutureWarning,
+                )
 
         if _endpoint_url:
-            runtime.set_env(prefix + "S3_ENDPOINT_URL", _endpoint_url)
+            runtime.set_env(prefix + "AWS_ENDPOINT_URL_S3", _endpoint_url)
         if aws_region:
             runtime.set_env(prefix + "AWS_REGION", aws_region)
         if non_anonymous:
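
Code that exports `S3_ENDPOINT_URL` keeps working through the 1.10 line but should migrate to the new variable. A sketch of the updated pattern (the MinIO endpoint and file names are illustrative):

    import os

    import mlrun
    from mlrun.runtimes.mounts import mount_s3

    # new name; the legacy S3_ENDPOINT_URL is still read but emits a FutureWarning
    os.environ["AWS_ENDPOINT_URL_S3"] = "https://minio.example.com:9000"

    fn = mlrun.code_to_function("reader", filename="reader.py", kind="job", image="mlrun/mlrun")
    fn.apply(mount_s3(non_anonymous=True))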
mlrun/runtimes/nuclio/__init__.py CHANGED
@@ -16,6 +16,7 @@ from .serving import ServingRuntime, new_v2_model_server  # noqa
 from .nuclio import nuclio_init_hook  # noqa
 from .function import (
     min_nuclio_versions,
+    multiple_port_sidecar_is_supported,
     RemoteRuntime,
 )  # noqa
 from .api_gateway import APIGateway
mlrun/runtimes/nuclio/application/application.py CHANGED
@@ -22,7 +22,10 @@ import mlrun.errors
 import mlrun.run
 from mlrun.common.runtimes.constants import NuclioIngressAddTemplatedIngressModes
 from mlrun.runtimes import RemoteRuntime
-from mlrun.runtimes.nuclio import min_nuclio_versions
+from mlrun.runtimes.nuclio import (
+    min_nuclio_versions,
+    multiple_port_sidecar_is_supported,
+)
 from mlrun.runtimes.nuclio.api_gateway import (
     APIGateway,
     APIGatewayMetadata,
@@ -182,7 +185,13 @@ class ApplicationSpec(NuclioSpec):
             if port != self.internal_application_port:
                 cleaned_ports.append(port)
 
-        self._application_ports = [self.internal_application_port] + cleaned_ports
+        application_ports = [self.internal_application_port] + cleaned_ports
+
+        # ensure multiple ports are supported in Nuclio
+        if len(application_ports) > 1:
+            multiple_port_sidecar_is_supported()
+
+        self._application_ports = application_ports
 
     @property
     def internal_application_port(self):
mlrun/runtimes/nuclio/function.py CHANGED
@@ -1045,6 +1045,9 @@ class RemoteRuntime(KubeResource):
             sidecar["image"] = image
 
         ports = mlrun.utils.helpers.as_list(ports)
+        if len(ports) > 1:
+            mlrun.runtimes.nuclio.multiple_port_sidecar_is_supported()
+
         # according to RFC-6335, port name should be less than 15 characters,
         # so we truncate it if needed and leave room for the index
         port_name = name[:13].rstrip("-_") if len(name) > 13 else name
@@ -1458,3 +1461,10 @@ def enrich_nuclio_function_from_headers(
         else []
     )
     func.status.container_image = headers.get("x-mlrun-container-image", "")
+
+
+@min_nuclio_versions("1.14.14")
+def multiple_port_sidecar_is_supported():
+    # multiple ports are supported from nuclio version 1.14.14
+    # this method exists only for running the min_nuclio_versions decorator
+    return True
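
The new function is a pure version gate: its body is trivially `True`, and the `min_nuclio_versions` decorator raises before the call when the deployed Nuclio is older than 1.14.14. A generic sketch of that decorator pattern (names and versions are illustrative, not MLRun's implementation):

    import functools

    def min_version(required: tuple, current: tuple):
        """Raise before invoking the wrapped function if current < required."""
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                if current < required:
                    raise RuntimeError(
                        f"{func.__name__} requires {required}, found {current}"
                    )
                return func(*args, **kwargs)
            return wrapper
        return decorator

    @min_version(required=(1, 14, 14), current=(1, 15, 0))
    def multiple_ports_supported() -> bool:
        # the body is trivial; calling it only exercises the version check
        return True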
mlrun/runtimes/nuclio/serving.py CHANGED
@@ -22,6 +22,7 @@ from nuclio import KafkaTrigger
 
 import mlrun
 import mlrun.common.schemas as schemas
+import mlrun.datastore.datastore_profile as ds_profile
 from mlrun.datastore import get_kafka_brokers_from_dict, parse_kafka_url
 from mlrun.model import ObjectList
 from mlrun.runtimes.function_reference import FunctionReference
@@ -740,6 +741,7 @@ class ServingRuntime(RemoteRuntime):
         current_function="*",
         track_models=False,
         workdir=None,
+        stream_profile: Optional[ds_profile.DatastoreProfile] = None,
         **kwargs,
     ) -> GraphServer:
         """create mock server object for local testing/emulation
@@ -748,6 +750,7 @@ class ServingRuntime(RemoteRuntime):
         :param current_function: specify if you want to simulate a child function, * for all functions
         :param track_models: allow model tracking (disabled by default in the mock server)
         :param workdir: working directory to locate the source code (if not the current one)
+        :param stream_profile: stream profile to use for the mock server output stream.
         """
 
         # set the namespaces/modules to look for the steps code in
@@ -787,6 +790,7 @@ class ServingRuntime(RemoteRuntime):
             logger=logger,
             is_mock=True,
             monitoring_mock=self.spec.track_models,
+            stream_profile=stream_profile,
         )
 
         server.graph = add_system_steps_to_graph(
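
Together these hunks let a locally emulated graph push to a real output stream. A hedged usage sketch (`serving_fn` is assumed to be an existing `ServingRuntime`; the profile fields mirror the docstring examples earlier in this diff):

    from mlrun.datastore.datastore_profile import DatastoreProfileKafkaStream

    stream_profile = DatastoreProfileKafkaStream(
        name="my-kafka",
        brokers=["localhost:9092"],
        topics=[],  # keep the topics list empty, as in the docstring above
    )
    server = serving_fn.to_mock_server(track_models=True, stream_profile=stream_profile)
    server.test("/v2/models/my-model/infer", body={"inputs": [[1, 2, 3]]})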
mlrun/runtimes/pod.py CHANGED
@@ -17,6 +17,7 @@ import os
 import re
 import time
 import typing
+import warnings
 from collections.abc import Iterable
 from enum import Enum
 
@@ -35,6 +36,7 @@ from mlrun.common.schemas import (
 
 from ..config import config as mlconf
 from ..k8s_utils import (
+    generate_preemptible_nodes_affinity_terms,
     validate_node_selectors,
 )
 from ..utils import logger, update_in
@@ -874,6 +876,133 @@ class KubeResource(BaseRuntime):
         """
         self.spec.with_requests(mem, cpu, patch=patch)
 
+    @staticmethod
+    def detect_preemptible_node_selector(node_selector: dict[str, str]) -> list[str]:
+        """
+        Check whether any provided node selector matches preemptible selectors.
+
+        :param node_selector: User-provided node selector mapping.
+        :return: List of `"key='value'"` strings that match a preemptible selector.
+        """
+        preemptible_node_selector = mlconf.get_preemptible_node_selector()
+
+        return [
+            f"'{key}': '{val}'"
+            for key, val in node_selector.items()
+            if preemptible_node_selector.get(key) == val
+        ]
+
+    def detect_preemptible_tolerations(
+        self, tolerations: list[k8s_client.V1Toleration]
+    ) -> list[str]:
+        """
+        Check whether any provided toleration matches preemptible tolerations.
+
+        :param tolerations: User-provided tolerations.
+        :return: List of formatted toleration strings that are considered preemptible.
+        """
+        preemptible_tolerations = [
+            k8s_client.V1Toleration(
+                key=toleration.get("key"),
+                value=toleration.get("value"),
+                effect=toleration.get("effect"),
+            )
+            for toleration in mlconf.get_preemptible_tolerations()
+        ]
+
+        def _format_toleration(toleration):
+            return f"'{toleration.key}'='{toleration.value}' (effect: '{toleration.effect}')"
+
+        return [
+            _format_toleration(toleration)
+            for toleration in tolerations
+            if toleration in preemptible_tolerations
+        ]
+
+    def detect_preemptible_affinity(self, affinity: k8s_client.V1Affinity) -> list[str]:
+        """
+        Check whether any provided affinity rules match preemptible affinity configs.
+
+        :param affinity: User-provided affinity object.
+        :return: List of formatted expressions that overlap with preemptible terms.
+        """
+        preemptible_affinity_terms = generate_preemptible_nodes_affinity_terms()
+        conflicting_affinities = []
+
+        if (
+            affinity
+            and affinity.node_affinity
+            and affinity.node_affinity.required_during_scheduling_ignored_during_execution
+        ):
+            user_terms = affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms
+            for user_term in user_terms:
+                user_expressions = {
+                    (expr.key, expr.operator, tuple(expr.values or []))
+                    for expr in user_term.match_expressions or []
+                }
+
+                for preemptible_term in preemptible_affinity_terms:
+                    preemptible_expressions = {
+                        (expr.key, expr.operator, tuple(expr.values or []))
+                        for expr in preemptible_term.match_expressions or []
+                    }
+
+                    # Ensure operators match and preemptible expressions are present
+                    common_exprs = user_expressions & preemptible_expressions
+                    if common_exprs:
+                        formatted = ", ".join(
+                            f"'{key} {operator} {list(values)}'"
+                            for key, operator, values in common_exprs
+                        )
+                        conflicting_affinities.append(formatted)
+        return conflicting_affinities
+
+    def raise_preemptible_warning(
+        self,
+        node_selector: typing.Optional[dict[str, str]],
+        tolerations: typing.Optional[list[k8s_client.V1Toleration]],
+        affinity: typing.Optional[k8s_client.V1Affinity],
+    ) -> None:
+        """
+        Detect conflicts and emit a single consolidated warning if needed.
+
+        :param node_selector: User-provided node selector.
+        :param tolerations: User-provided tolerations.
+        :param affinity: User-provided affinity.
+        :warns: PreemptionWarning - Emitted when any of the provided selectors,
+            tolerations, or affinity terms match the configured preemptible
+            settings. The message lists the conflicting items.
+        """
+        conflict_messages = []
+
+        if node_selector:
+            ns_conflicts = ", ".join(
+                self.detect_preemptible_node_selector(node_selector)
+            )
+            if ns_conflicts:
+                conflict_messages.append(f"Node selectors: {ns_conflicts}")
+
+        if tolerations:
+            tol_conflicts = ", ".join(self.detect_preemptible_tolerations(tolerations))
+            if tol_conflicts:
+                conflict_messages.append(f"Tolerations: {tol_conflicts}")
+
+        if affinity:
+            affinity_conflicts = ", ".join(self.detect_preemptible_affinity(affinity))
+            if affinity_conflicts:
+                conflict_messages.append(f"Affinity: {affinity_conflicts}")
+
+        if conflict_messages:
+            warning_componentes = "; \n".join(conflict_messages)
+            warnings.warn(
+                f"Warning: based on MLRun's preemptible node configuration, the following components \n"
+                f"may be removed or adjusted at runtime:\n"
+                f"{warning_componentes}.\n"
+                "This adjustment depends on the function's preemption mode. \n"
+                "The list of potential adjusted preemptible selectors can be viewed here: "
+                "mlrun.mlconf.get_preemptible_node_selector() and mlrun.mlconf.get_preemptible_tolerations()."
+            )
+
     def with_node_selection(
         self,
         node_name: typing.Optional[str] = None,
@@ -882,18 +1011,26 @@ class KubeResource(BaseRuntime):
         tolerations: typing.Optional[list[k8s_client.V1Toleration]] = None,
     ):
         """
-        Enables to control on which k8s node the job will run
-
-        :param node_name: The name of the k8s node
-        :param node_selector: Label selector, only nodes with matching labels will be eligible to be picked
-        :param affinity: Expands the types of constraints you can express - see
-            https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity
-            for details
-        :param tolerations: Tolerations are applied to pods, and allow (but do not require) the pods to schedule
-            onto nodes with matching taints - see
-            https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration
-            for details
+        Configure Kubernetes node scheduling for this function.
+
+        Updates one or more scheduling hints: exact node pinning, label-based selection,
+        affinity/anti-affinity rules, and taint tolerations. Passing ``None`` leaves the
+        current value unchanged; pass an empty dict/list (e.g., ``{}``, ``[]``) to clear.
 
+        :param node_name: Exact Kubernetes node name to pin the pod to.
+        :param node_selector: Mapping of label selectors. Use ``{}`` to clear.
+        :param affinity: :class:`kubernetes.client.V1Affinity` constraints.
+        :param tolerations: List of :class:`kubernetes.client.V1Toleration`. Use ``[]`` to clear.
+        :warns: PreemptionWarning - Emitted if provided selectors/tolerations/affinity
+            conflict with the function's preemption mode.
+
+        Example usage:
+            Prefer a GPU pool and allow scheduling on spot nodes::
+
+                job.with_node_selection(
+                    node_selector={"nodepool": "gpu"},
+                    tolerations=[k8s_client.V1Toleration(key="spot", operator="Exists")],
+                )
         """
         if node_name:
             self.spec.node_name = node_name
@@ -904,6 +1041,11 @@ class KubeResource(BaseRuntime):
             self.spec.affinity = affinity
         if tolerations is not None:
             self.spec.tolerations = tolerations
+        self.raise_preemptible_warning(
+            node_selector=self.spec.node_selector,
+            tolerations=self.spec.tolerations,
+            affinity=self.spec.affinity,
+        )
 
     def with_priority_class(self, name: typing.Optional[str] = None):
         """
mlrun/runtimes/utils.py CHANGED
@@ -26,6 +26,7 @@ import pandas as pd
 import mlrun
 import mlrun.common.constants
 import mlrun.common.constants as mlrun_constants
+import mlrun.common.runtimes.constants
 import mlrun.common.schemas
 import mlrun.utils.regex
 from mlrun.artifacts import TableArtifact
@@ -153,6 +154,7 @@ def results_to_iter(results, runspec, execution):
 
     iter = []
     failed = 0
+    pending_retry = 0
     running = 0
     for task in results:
         if task:
@@ -164,17 +166,26 @@ def results_to_iter(results, runspec, execution):
                 "state": state,
                 "iter": id,
             }
-            if state == "error":
+            if state == mlrun.common.runtimes.constants.RunStates.error:
                 failed += 1
                 err = get_in(task, ["status", "error"], "")
-                logger.error(f"error in task {execution.uid}:{id} - {err_to_str(err)}")
-            elif state != "completed":
+                logger.error(f"error in task {execution.uid}:{id} - {err_to_str(err)}")
+            elif state == mlrun.common.runtimes.constants.RunStates.pending_retry:
+                pending_retry += 1
+                err = get_in(task, ["status", "error"], "")
+                retry_count = get_in(task, ["status", "retry_count"], 0)
+                logger.warning(
+                    f"pending retry in task {execution.uid}:{id} - {err_to_str(err)}. Retry count: {retry_count}"
+                )
+            elif state != mlrun.common.runtimes.constants.RunStates.completed:
                 running += 1
 
         iter.append(struct)
 
     if not iter:
-        execution.set_state("completed", commit=True)
+        execution.set_state(
+            mlrun.common.runtimes.constants.RunStates.completed, commit=True
+        )
         logger.warning("Warning!, zero iteration results")
         return
     if hasattr(pd, "json_normalize"):
@@ -204,8 +215,14 @@ def results_to_iter(results, runspec, execution):
             error=f"{failed} of {len(results)} tasks failed, check logs in db for details",
             commit=False,
         )
+    elif pending_retry:
+        execution.set_state(
+            mlrun.common.runtimes.constants.RunStates.pending_retry, commit=False
+        )
     elif running == 0:
-        execution.set_state("completed", commit=False)
+        execution.set_state(
+            mlrun.common.runtimes.constants.RunStates.completed, commit=False
+        )
     execution.commit()
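
The net behavior change: a run whose iterations include a pending retry now resolves to pending_retry instead of completed, with failures still taking precedence. A distilled sketch of that precedence (assuming, as the removed literals suggest, that RunStates values compare like plain strings):

    from collections import Counter

    def aggregate_state(iteration_states: list[str]) -> str:
        tally = Counter(iteration_states)
        if tally["error"]:
            return "error"
        if tally["pending_retry"]:
            return "pending_retry"
        running = sum(
            n for s, n in tally.items()
            if s not in ("error", "pending_retry", "completed")
        )
        return "completed" if running == 0 else "running"

    print(aggregate_state(["completed", "pending_retry", "running"]))  # pending_retry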
 
mlrun/serving/routers.py CHANGED
@@ -31,6 +31,9 @@ import mlrun.common.model_monitoring
 import mlrun.common.schemas.model_monitoring
 from mlrun.utils import logger, now_date
 
+from ..common.model_monitoring.helpers import (
+    get_model_endpoints_creation_task_status,
+)
 from .utils import RouterToDict, _extract_input_data, _update_result_body
 from .v2_serving import _ModelLogPusher
 
@@ -171,46 +174,6 @@ class BaseModelRouter(RouterToDict):
         """run tasks after processing the event"""
         return event
 
-    def _get_background_task_status(
-        self,
-    ) -> mlrun.common.schemas.BackgroundTaskState:
-        self._background_task_check_timestamp = now_date()
-        server: mlrun.serving.GraphServer = getattr(
-            self.context, "_server", None
-        ) or getattr(self.context, "server", None)
-        if not self.context.is_mock:
-            if server.model_endpoint_creation_task_name:
-                background_task = mlrun.get_run_db().get_project_background_task(
-                    server.project, server.model_endpoint_creation_task_name
-                )
-                logger.debug(
-                    "Checking model endpoint creation task status",
-                    task_name=server.model_endpoint_creation_task_name,
-                )
-                if (
-                    background_task.status.state
-                    in mlrun.common.schemas.BackgroundTaskState.terminal_states()
-                ):
-                    logger.info(
-                        f"Model endpoint creation task completed with state {background_task.status.state}"
-                    )
-                else:  # in progress
-                    logger.info(
-                        f"Model endpoint creation task is still in progress with the current state: "
-                        f"{background_task.status.state}. Events will not be monitored for the next "
-                        f"{mlrun.mlconf.model_endpoint_monitoring.model_endpoint_creation_check_period} seconds",
-                        name=self.name,
-                        background_task_check_timestamp=self._background_task_check_timestamp.isoformat(),
-                    )
-                return background_task.status.state
-            else:
-                logger.error(
-                    "Model endpoint creation task name not provided. This function is not being monitored.",
-                )
-        elif self.context.monitoring_mock:
-            return mlrun.common.schemas.BackgroundTaskState.succeeded
-        return mlrun.common.schemas.BackgroundTaskState.failed
-
     def _update_background_task_state(self, event):
         if not self.background_task_reached_terminal_state and (
             self._background_task_check_timestamp is None
@@ -219,7 +182,26 @@ class BaseModelRouter(RouterToDict):
                 seconds=mlrun.mlconf.model_endpoint_monitoring.model_endpoint_creation_check_period
             )
         ):
-            self._background_task_current_state = self._get_background_task_status()
+            server: mlrun.serving.GraphServer = getattr(
+                self.context, "_server", None
+            ) or getattr(self.context, "server", None)
+            if not self.context.is_mock:
+                (
+                    self._background_task_current_state,
+                    self._background_task_check_timestamp,
+                    _,
+                ) = get_model_endpoints_creation_task_status(server)
+            elif self.context.monitoring_mock:
+                self._background_task_current_state = (
+                    mlrun.common.schemas.BackgroundTaskState.succeeded
+                )
+                self._background_task_check_timestamp = mlrun.utils.now_date()
+            else:
+                self._background_task_current_state = (
+                    mlrun.common.schemas.BackgroundTaskState.failed
+                )
+                self._background_task_check_timestamp = mlrun.utils.now_date()
+
         if event.body:
             event.body["background_task_state"] = (
                 self._background_task_current_state