mlrun 1.10.0rc24__py3-none-any.whl → 1.10.0rc26__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (50)
  1. mlrun/artifacts/llm_prompt.py +8 -1
  2. mlrun/common/model_monitoring/helpers.py +86 -0
  3. mlrun/common/schemas/hub.py +25 -18
  4. mlrun/common/schemas/model_monitoring/constants.py +1 -0
  5. mlrun/common/schemas/model_monitoring/model_endpoints.py +10 -1
  6. mlrun/config.py +2 -3
  7. mlrun/datastore/__init__.py +2 -2
  8. mlrun/datastore/azure_blob.py +66 -43
  9. mlrun/datastore/datastore_profile.py +35 -5
  10. mlrun/datastore/model_provider/huggingface_provider.py +122 -30
  11. mlrun/datastore/model_provider/model_provider.py +62 -4
  12. mlrun/datastore/model_provider/openai_provider.py +114 -43
  13. mlrun/datastore/s3.py +24 -2
  14. mlrun/datastore/storeytargets.py +2 -3
  15. mlrun/db/base.py +15 -1
  16. mlrun/db/httpdb.py +17 -6
  17. mlrun/db/nopdb.py +14 -0
  18. mlrun/k8s_utils.py +0 -14
  19. mlrun/model_monitoring/api.py +2 -2
  20. mlrun/model_monitoring/applications/base.py +37 -10
  21. mlrun/model_monitoring/applications/context.py +1 -4
  22. mlrun/model_monitoring/controller.py +15 -5
  23. mlrun/model_monitoring/db/_schedules.py +2 -4
  24. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +3 -1
  25. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
  26. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +3 -0
  27. mlrun/model_monitoring/helpers.py +5 -5
  28. mlrun/platforms/iguazio.py +7 -3
  29. mlrun/projects/project.py +33 -29
  30. mlrun/runtimes/base.py +0 -3
  31. mlrun/runtimes/mounts.py +15 -2
  32. mlrun/runtimes/nuclio/__init__.py +1 -0
  33. mlrun/runtimes/nuclio/application/application.py +11 -2
  34. mlrun/runtimes/nuclio/function.py +10 -0
  35. mlrun/runtimes/nuclio/serving.py +4 -0
  36. mlrun/runtimes/pod.py +153 -11
  37. mlrun/runtimes/utils.py +22 -5
  38. mlrun/serving/routers.py +23 -41
  39. mlrun/serving/server.py +26 -14
  40. mlrun/serving/states.py +3 -3
  41. mlrun/serving/system_steps.py +52 -29
  42. mlrun/serving/v2_serving.py +9 -10
  43. mlrun/utils/helpers.py +5 -2
  44. mlrun/utils/version/version.json +2 -2
  45. {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/METADATA +24 -23
  46. {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/RECORD +50 -50
  47. {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/WHEEL +0 -0
  48. {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/entry_points.txt +0 -0
  49. {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/licenses/LICENSE +0 -0
  50. {mlrun-1.10.0rc24.dist-info → mlrun-1.10.0rc26.dist-info}/top_level.txt +0 -0
mlrun/projects/project.py CHANGED
@@ -2749,16 +2749,18 @@ class MlrunProject(ModelObj):
         | Creating a function with non project source is done by specifying a module ``handler`` and on the
           returned function set the source with ``function.with_source_archive(<source>)``.
 
-        Support URL prefixes:
+        Supported URL prefixes:
 
-        | Object (s3://, v3io://, ..)
-        | MLRun DB e.g. db://project/func:ver
-        | Functions hub/market: e.g. hub://auto-trainer:master
+        - Object: s3://, v3io://, etc.
+        - MLRun DB: e.g db://project/func:ver
+        - Function hub/market: e.g. hub://auto-trainer:master
 
         Examples::
 
             proj.set_function(func_object)
-            proj.set_function("http://.../mynb.ipynb", "train")
+            proj.set_function(
+                "http://.../mynb.ipynb", "train", kind="job", image="mlrun/mlrun"
+            )
             proj.set_function("./func.yaml")
             proj.set_function("hub://get_toy_data", "getdata")
 
@@ -2785,18 +2787,6 @@ class MlrunProject(ModelObj):
             # By providing a path to a pip requirements file
             proj.set_function("my.py", requirements="requirements.txt")
 
-        One of the most important parameters is 'kind', used to specify the chosen runtime. The options are:
-        - local: execute a local python or shell script
-        - job: insert the code into a Kubernetes pod and execute it
-        - nuclio: insert the code into a real-time serverless nuclio function
-        - serving: insert code into orchestrated nuclio function(s) forming a DAG
-        - dask: run the specified python code / script as Dask Distributed job
-        - mpijob: run distributed Horovod jobs over the MPI job operator
-        - spark: run distributed Spark job using Spark Kubernetes Operator
-        - remote-spark: run distributed Spark job on remote Spark service
-        - databricks: run code on Databricks cluster (python scripts, Spark etc.)
-        - application: run a long living application (e.g. a web server, UI, etc.)
-
         Learn more about :doc:`../../concepts/functions-overview`.
 
         :param func:        Function object or spec/code url, None refers to current Notebook
@@ -2804,8 +2794,20 @@ class MlrunProject(ModelObj):
                             Versions (e.g. myfunc:v1). If the `tag` parameter is provided, the tag in the name
                             must match the tag parameter.
                             Specifying a tag in the name will update the project's tagged function (myfunc:v1)
-        :param kind:        Runtime kind e.g. job, nuclio, spark, dask, mpijob
-                            Default: job
+        :param kind:        Default: job. One of
+
+                            - local: execute a local python or shell script
+                            - job: insert the code into a Kubernetes pod and execute it
+                            - nuclio: insert the code into a real-time serverless nuclio function
+                            - serving: insert code into orchestrated nuclio function(s) forming a DAG
+                            - dask: run the specified python code / script as Dask Distributed job
+                            - mpijob: run distributed Horovod jobs over the MPI job operator
+                            - spark: run distributed Spark job using Spark Kubernetes Operator
+                            - remote-spark: run distributed Spark job on remote Spark service
+                            - databricks: run code on Databricks cluster (python scripts, Spark etc.)
+                            - application: run a long living application (e.g. a web server, UI, etc.)
+                            - handler: execute a python handler (used automatically in notebooks or for debug)
+
         :param image:       Docker image to be used, can also be specified in the function object/yaml
         :param handler:     Default function handler to invoke (can only be set with .py/.ipynb files)
        :param with_repo:   Add (clone) the current repo to the build source - use when the function code is in
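
The runtime kinds are now documented directly on the `kind` parameter. A minimal usage sketch of the call documented above (the project, file, and handler names here are illustrative, not from this diff):

    import mlrun

    project = mlrun.get_or_create_project("demo", context="./")
    # register a local Python file as a Kubernetes job with a default handler
    trainer = project.set_function(
        "trainer.py", name="trainer", kind="job", image="mlrun/mlrun", handler="train"
    )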
@@ -3814,7 +3816,7 @@ class MlrunProject(ModelObj):
 
             import mlrun
             from mlrun.datastore.datastore_profile import (
-                DatastoreProfileKafkaSource,
+                DatastoreProfileKafkaStream,
                 DatastoreProfileTDEngine,
             )
 
@@ -3831,7 +3833,7 @@ class MlrunProject(ModelObj):
             project.register_datastore_profile(tsdb_profile)
 
             # Create and register stream profile
-            stream_profile = DatastoreProfileKafkaSource(
+            stream_profile = DatastoreProfileKafkaStream(
                 name="my-kafka",
                 brokers=["<kafka-broker-ip-address>:9094"],
                 topics=[],  # Keep the topics list empty
@@ -3873,9 +3875,9 @@ class MlrunProject(ModelObj):
 
         .. code-block:: python
 
-            from mlrun.datastore.datastore_profile import DatastoreProfileKafkaSource
+            from mlrun.datastore.datastore_profile import DatastoreProfileKafkaStream
 
-            stream_profile = DatastoreProfileKafkaSource(
+            stream_profile = DatastoreProfileKafkaStream(
                 name="confluent-kafka",
                 brokers=["<server-domain-start>.confluent.cloud:9092"],
                 topics=[],
@@ -3904,7 +3906,7 @@ class MlrunProject(ModelObj):
         The supported profiles are:
 
         * :py:class:`~mlrun.datastore.datastore_profile.DatastoreProfileV3io`
-        * :py:class:`~mlrun.datastore.datastore_profile.DatastoreProfileKafkaSource`
+        * :py:class:`~mlrun.datastore.datastore_profile.DatastoreProfileKafkaStream`
 
         You need to register one of them, and pass the profile's name.
         :param replace_creds: If ``True`` - override the existing credentials.
@@ -3944,7 +3946,9 @@ class MlrunProject(ModelObj):
         start: Optional[datetime.datetime] = None,
         end: Optional[datetime.datetime] = None,
         top_level: bool = False,
-        mode: Optional[mlrun.common.schemas.EndpointMode] = None,
+        modes: Optional[
+            Union[mm_constants.EndpointMode, list[mm_constants.EndpointMode]]
+        ] = None,
         uids: Optional[list[str]] = None,
         latest_only: bool = False,
         tsdb_metrics: bool = False,
@@ -3960,7 +3964,7 @@ class MlrunProject(ModelObj):
         5) function_tag
         6) labels
         7) top level
-        8) mode
+        8) modes
         9) uids
         10) start and end time, corresponding to the `created` field.
         By default, when no filters are applied, all available endpoints for the given project will be listed.
@@ -3982,8 +3986,8 @@ class MlrunProject(ModelObj):
         :param start: The start time to filter by.Corresponding to the `created` field.
         :param end: The end time to filter by. Corresponding to the `created` field.
         :param top_level: If true will return only routers and endpoint that are NOT children of any router.
-        :param mode: Specifies the mode of the model endpoint. Can be "real-time" (0), "batch" (1), or
-            both if set to None.
+        :param modes: Specifies the mode of the model endpoint. Can be "real-time" (0), "batch" (1),
+            "batch_legacy" (2). If set to None, all are included.
         :param uids: If passed will return a list `ModelEndpoint` object with uid in uids.
         :param tsdb_metrics: When True, the time series metrics will be added to the output
             of the resulting.
@@ -4005,7 +4009,7 @@ class MlrunProject(ModelObj):
             start=start,
             end=end,
             top_level=top_level,
-            mode=mode,
+            modes=modes,
             uids=uids,
             latest_only=latest_only,
             tsdb_metrics=tsdb_metrics,
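
The `mode` filter is now `modes` and accepts a single value or a list. A hedged sketch of the new call shape (the exact `EndpointMode` member spelling below is an assumption; check `mm_constants.EndpointMode` in this release):

    import mlrun
    from mlrun.common.schemas.model_monitoring import constants as mm_constants

    project = mlrun.get_or_create_project("my-project", context="./")
    # a single EndpointMode or a list is accepted; None (the default) includes all modes
    endpoints = project.list_model_endpoints(modes=[mm_constants.EndpointMode.BATCH])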
mlrun/runtimes/base.py CHANGED
@@ -142,9 +142,6 @@ class FunctionSpec(ModelObj):
     def build(self, build):
         self._build = self._verify_dict(build, "build", ImageBuilder)
 
-    def enrich_function_preemption_spec(self):
-        pass
-
     def validate_service_account(self, allowed_service_accounts):
         pass
 
mlrun/runtimes/mounts.py CHANGED
@@ -14,6 +14,7 @@
 
 import os
 import typing
+import warnings
 from collections import namedtuple
 
 from mlrun.config import config
@@ -247,10 +248,22 @@ def mount_s3(
     def _use_s3_cred(runtime: "KubeResource"):
         _access_key = aws_access_key or os.environ.get(prefix + "AWS_ACCESS_KEY_ID")
         _secret_key = aws_secret_key or os.environ.get(prefix + "AWS_SECRET_ACCESS_KEY")
-        _endpoint_url = endpoint_url or os.environ.get(prefix + "S3_ENDPOINT_URL")
+
+        # Check for endpoint URL with backward compatibility
+        _endpoint_url = endpoint_url or os.environ.get(prefix + "AWS_ENDPOINT_URL_S3")
+        if not _endpoint_url:
+            # Check for deprecated environment variable
+            _endpoint_url = os.environ.get(prefix + "S3_ENDPOINT_URL")
+            if _endpoint_url:
+                warnings.warn(
+                    "S3_ENDPOINT_URL is deprecated in 1.10.0 and will be removed in 1.12.0, "
+                    "use AWS_ENDPOINT_URL_S3 instead.",
+                    # TODO: Remove this in 1.12.0
+                    FutureWarning,
+                )
 
         if _endpoint_url:
-            runtime.set_env(prefix + "S3_ENDPOINT_URL", _endpoint_url)
+            runtime.set_env(prefix + "AWS_ENDPOINT_URL_S3", _endpoint_url)
         if aws_region:
             runtime.set_env(prefix + "AWS_REGION", aws_region)
         if non_anonymous:
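
Code that exports `S3_ENDPOINT_URL` keeps working through the 1.10 line but should migrate to the new variable. A sketch of the updated pattern (the MinIO endpoint and file names are illustrative):

    import os

    import mlrun
    from mlrun.runtimes.mounts import mount_s3

    # new name; the legacy S3_ENDPOINT_URL is still read but emits a FutureWarning
    os.environ["AWS_ENDPOINT_URL_S3"] = "https://minio.example.com:9000"

    fn = mlrun.code_to_function("reader", filename="reader.py", kind="job", image="mlrun/mlrun")
    fn.apply(mount_s3(non_anonymous=True))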
mlrun/runtimes/nuclio/__init__.py CHANGED
@@ -16,6 +16,7 @@ from .serving import ServingRuntime, new_v2_model_server  # noqa
 from .nuclio import nuclio_init_hook  # noqa
 from .function import (
     min_nuclio_versions,
+    multiple_port_sidecar_is_supported,
     RemoteRuntime,
 )  # noqa
 from .api_gateway import APIGateway
mlrun/runtimes/nuclio/application/application.py CHANGED
@@ -22,7 +22,10 @@ import mlrun.errors
 import mlrun.run
 from mlrun.common.runtimes.constants import NuclioIngressAddTemplatedIngressModes
 from mlrun.runtimes import RemoteRuntime
-from mlrun.runtimes.nuclio import min_nuclio_versions
+from mlrun.runtimes.nuclio import (
+    min_nuclio_versions,
+    multiple_port_sidecar_is_supported,
+)
 from mlrun.runtimes.nuclio.api_gateway import (
     APIGateway,
     APIGatewayMetadata,
@@ -182,7 +185,13 @@ class ApplicationSpec(NuclioSpec):
             if port != self.internal_application_port:
                 cleaned_ports.append(port)
 
-        self._application_ports = [self.internal_application_port] + cleaned_ports
+        application_ports = [self.internal_application_port] + cleaned_ports
+
+        # ensure multiple ports are supported in Nuclio
+        if len(application_ports) > 1:
+            multiple_port_sidecar_is_supported()
+
+        self._application_ports = application_ports
 
     @property
     def internal_application_port(self):
mlrun/runtimes/nuclio/function.py CHANGED
@@ -1045,6 +1045,9 @@ class RemoteRuntime(KubeResource):
             sidecar["image"] = image
 
         ports = mlrun.utils.helpers.as_list(ports)
+        if len(ports) > 1:
+            mlrun.runtimes.nuclio.multiple_port_sidecar_is_supported()
+
         # according to RFC-6335, port name should be less than 15 characters,
         # so we truncate it if needed and leave room for the index
         port_name = name[:13].rstrip("-_") if len(name) > 13 else name
@@ -1458,3 +1461,10 @@ def enrich_nuclio_function_from_headers(
         else []
     )
     func.status.container_image = headers.get("x-mlrun-container-image", "")
+
+
+@min_nuclio_versions("1.14.14")
+def multiple_port_sidecar_is_supported():
+    # multiple ports are supported from nuclio version 1.14.14
+    # this method exists only for running the min_nuclio_versions decorator
+    return True
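
The new function is a pure version gate: its body is trivially `True`, and the `min_nuclio_versions` decorator raises before the call when the deployed Nuclio is older than 1.14.14. A generic sketch of that decorator pattern (names and versions are illustrative, not MLRun's implementation):

    import functools

    def min_version(required: tuple, current: tuple):
        """Raise before invoking the wrapped function if current < required."""
        def decorator(func):
            @functools.wraps(func)
            def wrapper(*args, **kwargs):
                if current < required:
                    raise RuntimeError(
                        f"{func.__name__} requires {required}, found {current}"
                    )
                return func(*args, **kwargs)
            return wrapper
        return decorator

    @min_version(required=(1, 14, 14), current=(1, 15, 0))
    def multiple_ports_supported() -> bool:
        # the body is trivial; calling it only exercises the version check
        return True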
mlrun/runtimes/nuclio/serving.py CHANGED
@@ -22,6 +22,7 @@ from nuclio import KafkaTrigger
 
 import mlrun
 import mlrun.common.schemas as schemas
+import mlrun.datastore.datastore_profile as ds_profile
 from mlrun.datastore import get_kafka_brokers_from_dict, parse_kafka_url
 from mlrun.model import ObjectList
 from mlrun.runtimes.function_reference import FunctionReference
@@ -740,6 +741,7 @@ class ServingRuntime(RemoteRuntime):
         current_function="*",
         track_models=False,
         workdir=None,
+        stream_profile: Optional[ds_profile.DatastoreProfile] = None,
         **kwargs,
     ) -> GraphServer:
         """create mock server object for local testing/emulation
@@ -748,6 +750,7 @@ class ServingRuntime(RemoteRuntime):
         :param current_function: specify if you want to simulate a child function, * for all functions
         :param track_models: allow model tracking (disabled by default in the mock server)
         :param workdir: working directory to locate the source code (if not the current one)
+        :param stream_profile: stream profile to use for the mock server output stream.
         """
 
         # set the namespaces/modules to look for the steps code in
@@ -787,6 +790,7 @@ class ServingRuntime(RemoteRuntime):
             logger=logger,
             is_mock=True,
             monitoring_mock=self.spec.track_models,
+            stream_profile=stream_profile,
         )
 
         server.graph = add_system_steps_to_graph(
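
Together these hunks let a locally emulated graph push to a real output stream. A hedged usage sketch (`serving_fn` is assumed to be an existing `ServingRuntime`; the profile fields mirror the docstring examples earlier in this diff):

    from mlrun.datastore.datastore_profile import DatastoreProfileKafkaStream

    stream_profile = DatastoreProfileKafkaStream(
        name="my-kafka",
        brokers=["localhost:9092"],
        topics=[],  # keep the topics list empty, as in the docstring above
    )
    server = serving_fn.to_mock_server(track_models=True, stream_profile=stream_profile)
    server.test("/v2/models/my-model/infer", body={"inputs": [[1, 2, 3]]})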
mlrun/runtimes/pod.py CHANGED
@@ -17,6 +17,7 @@ import os
 import re
 import time
 import typing
+import warnings
 from collections.abc import Iterable
 from enum import Enum
 
@@ -35,6 +36,7 @@ from mlrun.common.schemas import (
 
 from ..config import config as mlconf
 from ..k8s_utils import (
+    generate_preemptible_nodes_affinity_terms,
     validate_node_selectors,
 )
 from ..utils import logger, update_in
@@ -874,6 +876,133 @@ class KubeResource(BaseRuntime):
         """
         self.spec.with_requests(mem, cpu, patch=patch)
 
+    @staticmethod
+    def detect_preemptible_node_selector(node_selector: dict[str, str]) -> list[str]:
+        """
+        Check whether any provided node selector matches preemptible selectors.
+
+        :param node_selector: User-provided node selector mapping.
+        :return: List of `"key='value'"` strings that match a preemptible selector.
+        """
+        preemptible_node_selector = mlconf.get_preemptible_node_selector()
+
+        return [
+            f"'{key}': '{val}'"
+            for key, val in node_selector.items()
+            if preemptible_node_selector.get(key) == val
+        ]
+
+    def detect_preemptible_tolerations(
+        self, tolerations: list[k8s_client.V1Toleration]
+    ) -> list[str]:
+        """
+        Check whether any provided toleration matches preemptible tolerations.
+
+        :param tolerations: User-provided tolerations.
+        :return: List of formatted toleration strings that are considered preemptible.
+        """
+        preemptible_tolerations = [
+            k8s_client.V1Toleration(
+                key=toleration.get("key"),
+                value=toleration.get("value"),
+                effect=toleration.get("effect"),
+            )
+            for toleration in mlconf.get_preemptible_tolerations()
+        ]
+
+        def _format_toleration(toleration):
+            return f"'{toleration.key}'='{toleration.value}' (effect: '{toleration.effect}')"
+
+        return [
+            _format_toleration(toleration)
+            for toleration in tolerations
+            if toleration in preemptible_tolerations
+        ]
+
+    def detect_preemptible_affinity(self, affinity: k8s_client.V1Affinity) -> list[str]:
+        """
+        Check whether any provided affinity rules match preemptible affinity configs.
+
+        :param affinity: User-provided affinity object.
+        :return: List of formatted expressions that overlap with preemptible terms.
+        """
+        preemptible_affinity_terms = generate_preemptible_nodes_affinity_terms()
+        conflicting_affinities = []
+
+        if (
+            affinity
+            and affinity.node_affinity
+            and affinity.node_affinity.required_during_scheduling_ignored_during_execution
+        ):
+            user_terms = affinity.node_affinity.required_during_scheduling_ignored_during_execution.node_selector_terms
+            for user_term in user_terms:
+                user_expressions = {
+                    (expr.key, expr.operator, tuple(expr.values or []))
+                    for expr in user_term.match_expressions or []
+                }
+
+                for preemptible_term in preemptible_affinity_terms:
+                    preemptible_expressions = {
+                        (expr.key, expr.operator, tuple(expr.values or []))
+                        for expr in preemptible_term.match_expressions or []
+                    }
+
+                    # Ensure operators match and preemptible expressions are present
+                    common_exprs = user_expressions & preemptible_expressions
+                    if common_exprs:
+                        formatted = ", ".join(
+                            f"'{key} {operator} {list(values)}'"
+                            for key, operator, values in common_exprs
+                        )
+                        conflicting_affinities.append(formatted)
+        return conflicting_affinities
+
+    def raise_preemptible_warning(
+        self,
+        node_selector: typing.Optional[dict[str, str]],
+        tolerations: typing.Optional[list[k8s_client.V1Toleration]],
+        affinity: typing.Optional[k8s_client.V1Affinity],
+    ) -> None:
+        """
+        Detect conflicts and emit a single consolidated warning if needed.
+
+        :param node_selector: User-provided node selector.
+        :param tolerations: User-provided tolerations.
+        :param affinity: User-provided affinity.
+        :warns: PreemptionWarning - Emitted when any of the provided selectors,
+            tolerations, or affinity terms match the configured preemptible
+            settings. The message lists the conflicting items.
+        """
+        conflict_messages = []
+
+        if node_selector:
+            ns_conflicts = ", ".join(
+                self.detect_preemptible_node_selector(node_selector)
+            )
+            if ns_conflicts:
+                conflict_messages.append(f"Node selectors: {ns_conflicts}")
+
+        if tolerations:
+            tol_conflicts = ", ".join(self.detect_preemptible_tolerations(tolerations))
+            if tol_conflicts:
+                conflict_messages.append(f"Tolerations: {tol_conflicts}")
+
+        if affinity:
+            affinity_conflicts = ", ".join(self.detect_preemptible_affinity(affinity))
+            if affinity_conflicts:
+                conflict_messages.append(f"Affinity: {affinity_conflicts}")
+
+        if conflict_messages:
+            warning_componentes = "; \n".join(conflict_messages)
+            warnings.warn(
+                f"Warning: based on MLRun's preemptible node configuration, the following components \n"
+                f"may be removed or adjusted at runtime:\n"
+                f"{warning_componentes}.\n"
+                "This adjustment depends on the function's preemption mode. \n"
+                "The list of potential adjusted preemptible selectors can be viewed here: "
+                "mlrun.mlconf.get_preemptible_node_selector() and mlrun.mlconf.get_preemptible_tolerations()."
+            )
+
     def with_node_selection(
         self,
         node_name: typing.Optional[str] = None,
@@ -882,18 +1011,26 @@ class KubeResource(BaseRuntime):
         tolerations: typing.Optional[list[k8s_client.V1Toleration]] = None,
     ):
         """
-        Enables to control on which k8s node the job will run
-
-        :param node_name: The name of the k8s node
-        :param node_selector: Label selector, only nodes with matching labels will be eligible to be picked
-        :param affinity: Expands the types of constraints you can express - see
-            https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#affinity-and-anti-affinity
-            for details
-        :param tolerations: Tolerations are applied to pods, and allow (but do not require) the pods to schedule
-            onto nodes with matching taints - see
-            https://kubernetes.io/docs/concepts/scheduling-eviction/taint-and-toleration
-            for details
+        Configure Kubernetes node scheduling for this function.
+
+        Updates one or more scheduling hints: exact node pinning, label-based selection,
+        affinity/anti-affinity rules, and taint tolerations. Passing ``None`` leaves the
+        current value unchanged; pass an empty dict/list (e.g., ``{}``, ``[]``) to clear.
 
+        :param node_name: Exact Kubernetes node name to pin the pod to.
+        :param node_selector: Mapping of label selectors. Use ``{}`` to clear.
+        :param affinity: :class:`kubernetes.client.V1Affinity` constraints.
+        :param tolerations: List of :class:`kubernetes.client.V1Toleration`. Use ``[]`` to clear.
+        :warns: PreemptionWarning - Emitted if provided selectors/tolerations/affinity
+            conflict with the function's preemption mode.
+
+        Example usage:
+            Prefer a GPU pool and allow scheduling on spot nodes::
+
+                job.with_node_selection(
+                    node_selector={"nodepool": "gpu"},
+                    tolerations=[k8s_client.V1Toleration(key="spot", operator="Exists")],
+                )
         """
         if node_name:
             self.spec.node_name = node_name
@@ -904,6 +1041,11 @@ class KubeResource(BaseRuntime):
             self.spec.affinity = affinity
         if tolerations is not None:
             self.spec.tolerations = tolerations
+        self.raise_preemptible_warning(
+            node_selector=self.spec.node_selector,
+            tolerations=self.spec.tolerations,
+            affinity=self.spec.affinity,
+        )
 
     def with_priority_class(self, name: typing.Optional[str] = None):
         """
mlrun/runtimes/utils.py CHANGED
@@ -26,6 +26,7 @@ import pandas as pd
 import mlrun
 import mlrun.common.constants
 import mlrun.common.constants as mlrun_constants
+import mlrun.common.runtimes.constants
 import mlrun.common.schemas
 import mlrun.utils.regex
 from mlrun.artifacts import TableArtifact
@@ -153,6 +154,7 @@ def results_to_iter(results, runspec, execution):
 
     iter = []
     failed = 0
+    pending_retry = 0
     running = 0
     for task in results:
         if task:
@@ -164,17 +166,26 @@ def results_to_iter(results, runspec, execution):
                 "state": state,
                 "iter": id,
             }
-            if state == "error":
+            if state == mlrun.common.runtimes.constants.RunStates.error:
                 failed += 1
                 err = get_in(task, ["status", "error"], "")
-                logger.error(f"error in task {execution.uid}:{id} - {err_to_str(err)}")
-            elif state != "completed":
+                logger.error(f"error in task {execution.uid}:{id} - {err_to_str(err)}")
+            elif state == mlrun.common.runtimes.constants.RunStates.pending_retry:
+                pending_retry += 1
+                err = get_in(task, ["status", "error"], "")
+                retry_count = get_in(task, ["status", "retry_count"], 0)
+                logger.warning(
+                    f"pending retry in task {execution.uid}:{id} - {err_to_str(err)}. Retry count: {retry_count}"
+                )
+            elif state != mlrun.common.runtimes.constants.RunStates.completed:
                 running += 1
 
         iter.append(struct)
 
     if not iter:
-        execution.set_state("completed", commit=True)
+        execution.set_state(
+            mlrun.common.runtimes.constants.RunStates.completed, commit=True
+        )
         logger.warning("Warning!, zero iteration results")
         return
     if hasattr(pd, "json_normalize"):
@@ -204,8 +215,14 @@ def results_to_iter(results, runspec, execution):
             error=f"{failed} of {len(results)} tasks failed, check logs in db for details",
             commit=False,
         )
+    elif pending_retry:
+        execution.set_state(
+            mlrun.common.runtimes.constants.RunStates.pending_retry, commit=False
+        )
     elif running == 0:
-        execution.set_state("completed", commit=False)
+        execution.set_state(
+            mlrun.common.runtimes.constants.RunStates.completed, commit=False
+        )
     execution.commit()
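
The net behavior change: a run whose iterations include a pending retry now resolves to pending_retry instead of completed, with failures still taking precedence. A distilled sketch of that precedence (assuming, as the removed literals suggest, that RunStates values compare like plain strings):

    from collections import Counter

    def aggregate_state(iteration_states: list[str]) -> str:
        tally = Counter(iteration_states)
        if tally["error"]:
            return "error"
        if tally["pending_retry"]:
            return "pending_retry"
        running = sum(
            n for s, n in tally.items()
            if s not in ("error", "pending_retry", "completed")
        )
        return "completed" if running == 0 else "running"

    print(aggregate_state(["completed", "pending_retry", "running"]))  # pending_retry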
 
mlrun/serving/routers.py CHANGED
@@ -31,6 +31,9 @@ import mlrun.common.model_monitoring
 import mlrun.common.schemas.model_monitoring
 from mlrun.utils import logger, now_date
 
+from ..common.model_monitoring.helpers import (
+    get_model_endpoints_creation_task_status,
+)
 from .utils import RouterToDict, _extract_input_data, _update_result_body
 from .v2_serving import _ModelLogPusher
 
@@ -171,46 +174,6 @@ class BaseModelRouter(RouterToDict):
         """run tasks after processing the event"""
         return event
 
-    def _get_background_task_status(
-        self,
-    ) -> mlrun.common.schemas.BackgroundTaskState:
-        self._background_task_check_timestamp = now_date()
-        server: mlrun.serving.GraphServer = getattr(
-            self.context, "_server", None
-        ) or getattr(self.context, "server", None)
-        if not self.context.is_mock:
-            if server.model_endpoint_creation_task_name:
-                background_task = mlrun.get_run_db().get_project_background_task(
-                    server.project, server.model_endpoint_creation_task_name
-                )
-                logger.debug(
-                    "Checking model endpoint creation task status",
-                    task_name=server.model_endpoint_creation_task_name,
-                )
-                if (
-                    background_task.status.state
-                    in mlrun.common.schemas.BackgroundTaskState.terminal_states()
-                ):
-                    logger.info(
-                        f"Model endpoint creation task completed with state {background_task.status.state}"
-                    )
-                else:  # in progress
-                    logger.info(
-                        f"Model endpoint creation task is still in progress with the current state: "
-                        f"{background_task.status.state}. Events will not be monitored for the next "
-                        f"{mlrun.mlconf.model_endpoint_monitoring.model_endpoint_creation_check_period} seconds",
-                        name=self.name,
-                        background_task_check_timestamp=self._background_task_check_timestamp.isoformat(),
-                    )
-                return background_task.status.state
-            else:
-                logger.error(
-                    "Model endpoint creation task name not provided. This function is not being monitored.",
-                )
-        elif self.context.monitoring_mock:
-            return mlrun.common.schemas.BackgroundTaskState.succeeded
-        return mlrun.common.schemas.BackgroundTaskState.failed
-
     def _update_background_task_state(self, event):
         if not self.background_task_reached_terminal_state and (
             self._background_task_check_timestamp is None
@@ -219,7 +182,26 @@ class BaseModelRouter(RouterToDict):
                 seconds=mlrun.mlconf.model_endpoint_monitoring.model_endpoint_creation_check_period
             )
         ):
-            self._background_task_current_state = self._get_background_task_status()
+            server: mlrun.serving.GraphServer = getattr(
+                self.context, "_server", None
+            ) or getattr(self.context, "server", None)
+            if not self.context.is_mock:
+                (
+                    self._background_task_current_state,
+                    self._background_task_check_timestamp,
+                    _,
+                ) = get_model_endpoints_creation_task_status(server)
+            elif self.context.monitoring_mock:
+                self._background_task_current_state = (
+                    mlrun.common.schemas.BackgroundTaskState.succeeded
+                )
+                self._background_task_check_timestamp = mlrun.utils.now_date()
+            else:
+                self._background_task_current_state = (
+                    mlrun.common.schemas.BackgroundTaskState.failed
+                )
+                self._background_task_check_timestamp = mlrun.utils.now_date()
+
         if event.body:
             event.body["background_task_state"] = (
                 self._background_task_current_state