mlrun 1.10.0rc11__py3-none-any.whl → 1.10.0rc12__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of mlrun might be problematic.
Files changed (54)
  1. mlrun/__init__.py +2 -1
  2. mlrun/__main__.py +7 -1
  3. mlrun/artifacts/base.py +9 -3
  4. mlrun/artifacts/dataset.py +2 -1
  5. mlrun/artifacts/llm_prompt.py +1 -1
  6. mlrun/artifacts/model.py +2 -2
  7. mlrun/common/constants.py +1 -0
  8. mlrun/common/runtimes/constants.py +10 -1
  9. mlrun/config.py +19 -2
  10. mlrun/datastore/__init__.py +3 -1
  11. mlrun/datastore/alibaba_oss.py +1 -1
  12. mlrun/datastore/azure_blob.py +1 -1
  13. mlrun/datastore/base.py +6 -31
  14. mlrun/datastore/datastore.py +109 -33
  15. mlrun/datastore/datastore_profile.py +31 -0
  16. mlrun/datastore/dbfs_store.py +1 -1
  17. mlrun/datastore/google_cloud_storage.py +2 -2
  18. mlrun/datastore/model_provider/__init__.py +13 -0
  19. mlrun/datastore/model_provider/model_provider.py +82 -0
  20. mlrun/datastore/model_provider/openai_provider.py +120 -0
  21. mlrun/datastore/remote_client.py +54 -0
  22. mlrun/datastore/s3.py +1 -1
  23. mlrun/datastore/storeytargets.py +1 -1
  24. mlrun/datastore/utils.py +22 -0
  25. mlrun/datastore/v3io.py +1 -1
  26. mlrun/db/base.py +1 -1
  27. mlrun/db/httpdb.py +9 -4
  28. mlrun/db/nopdb.py +1 -1
  29. mlrun/execution.py +23 -7
  30. mlrun/launcher/base.py +23 -13
  31. mlrun/launcher/local.py +3 -1
  32. mlrun/launcher/remote.py +4 -2
  33. mlrun/model.py +65 -0
  34. mlrun/package/packagers_manager.py +2 -0
  35. mlrun/projects/operations.py +8 -1
  36. mlrun/projects/project.py +23 -5
  37. mlrun/run.py +17 -0
  38. mlrun/runtimes/__init__.py +6 -0
  39. mlrun/runtimes/base.py +24 -6
  40. mlrun/runtimes/daskjob.py +1 -0
  41. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  42. mlrun/runtimes/local.py +1 -6
  43. mlrun/serving/server.py +0 -2
  44. mlrun/serving/states.py +30 -5
  45. mlrun/serving/system_steps.py +22 -28
  46. mlrun/utils/helpers.py +13 -2
  47. mlrun/utils/notifications/notification_pusher.py +15 -0
  48. mlrun/utils/version/version.json +2 -2
  49. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc12.dist-info}/METADATA +2 -2
  50. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc12.dist-info}/RECORD +54 -50
  51. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc12.dist-info}/WHEEL +0 -0
  52. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc12.dist-info}/entry_points.txt +0 -0
  53. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc12.dist-info}/licenses/LICENSE +0 -0
  54. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc12.dist-info}/top_level.txt +0 -0
mlrun/datastore/model_provider/model_provider.py ADDED
@@ -0,0 +1,82 @@
+# Copyright 2025 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from collections.abc import Awaitable
+from typing import Callable, Optional, TypeVar
+
+import mlrun.errors
+from mlrun.datastore.remote_client import (
+    BaseRemoteClient,
+)
+
+T = TypeVar("T")
+
+
+class ModelProvider(BaseRemoteClient):
+    support_async = False
+
+    def __init__(
+        self,
+        parent,
+        kind,
+        name,
+        endpoint="",
+        secrets: Optional[dict] = None,
+        default_invoke_kwargs: Optional[dict] = None,
+    ):
+        super().__init__(
+            parent=parent, name=name, kind=kind, endpoint=endpoint, secrets=secrets
+        )
+        self.default_invoke_kwargs = default_invoke_kwargs or {}
+        self._client = None
+        self._default_operation = None
+        self._async_client = None
+        self._default_async_operation = None
+
+    def load_client(self) -> None:
+        raise NotImplementedError("load_client method is not implemented")
+
+    def invoke(self, prompt: Optional[str] = None, **invoke_kwargs) -> str:
+        raise NotImplementedError("invoke method is not implemented")
+
+    def customized_invoke(
+        self, operation: Optional[Callable[..., T]] = None, **invoke_kwargs
+    ) -> Optional[T]:
+        raise NotImplementedError("customized_invoke method is not implemented")
+
+    @property
+    def client(self):
+        return self._client
+
+    @property
+    def model(self):
+        return None
+
+    def get_invoke_kwargs(self, invoke_kwargs):
+        kwargs = self.default_invoke_kwargs.copy()
+        kwargs.update(invoke_kwargs)
+        return kwargs
+
+    @property
+    def async_client(self):
+        if not self.support_async:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"{self.__class__.__name__} does not support async operations"
+            )
+        return self._async_client
+
+    async def async_customized_invoke(self, **kwargs):
+        raise NotImplementedError("async_customized_invoke is not implemented")
+
+    async def async_invoke(self, prompt: str, **invoke_kwargs) -> Awaitable[str]:
+        raise NotImplementedError("async_invoke is not implemented")
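
The ModelProvider base above is an abstract interface; a concrete provider overrides load_client and invoke (and the async variants when support_async is set). A minimal sketch of a custom subclass follows; EchoProvider is hypothetical and not part of the package, it only illustrates the surface a subclass implements.

```python
from typing import Optional

from mlrun.datastore.model_provider.model_provider import ModelProvider


class EchoProvider(ModelProvider):
    """Hypothetical provider that echoes prompts back, for illustration only."""

    def load_client(self) -> None:
        # a real provider would construct its SDK client here and keep it in self._client
        self._client = object()

    def invoke(self, prompt: Optional[str] = None, **invoke_kwargs) -> str:
        # merge per-call kwargs over default_invoke_kwargs, as the base class expects
        kwargs = self.get_invoke_kwargs(invoke_kwargs)
        return f"echo: {prompt} (kwargs={kwargs})"


provider = EchoProvider(parent=None, kind="echo", name="echo-provider")
provider.load_client()
print(provider.invoke(prompt="hello"))  # -> "echo: hello (kwargs={})"
```
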
mlrun/datastore/model_provider/openai_provider.py ADDED
@@ -0,0 +1,120 @@
+# Copyright 2025 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Optional, TypeVar
+
+import mlrun
+from mlrun.datastore.model_provider.model_provider import ModelProvider
+
+T = TypeVar("T")
+
+
+class OpenAIProvider(ModelProvider):
+    def __init__(
+        self,
+        parent,
+        schema,
+        name,
+        endpoint="",
+        secrets: Optional[dict] = None,
+        default_invoke_kwargs: Optional[dict] = None,
+    ):
+        endpoint = endpoint or mlrun.mlconf.model_providers.openai_default_model
+        if schema != "openai":
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "OpenAIProvider supports only 'openai' as the provider kind."
+            )
+        super().__init__(
+            parent=parent,
+            kind=schema,
+            name=name,
+            endpoint=endpoint,
+            secrets=secrets,
+            default_invoke_kwargs=default_invoke_kwargs,
+        )
+        self.options = self.get_client_options()
+        self.load_client()
+
+    @classmethod
+    def parse_endpoint_and_path(cls, endpoint, subpath) -> (str, str):
+        if endpoint and subpath:
+            endpoint = endpoint + subpath
+        # in openai there is no usage of subpath variable. if the model contains "/", it is part of the model name.
+        subpath = ""
+        return endpoint, subpath
+
+    @property
+    def model(self):
+        return self.endpoint
+
+    def load_client(self) -> None:
+        try:
+            from openai import OpenAI  # noqa
+
+            self._client = OpenAI(**self.options)
+            self._default_operation = self.client.chat.completions.create
+        except ImportError as exc:
+            raise ImportError("openai package is not installed") from exc
+
+    def get_client_options(self):
+        res = dict(
+            api_key=self._get_secret_or_env("OPENAI_API_KEY"),
+            organization=self._get_secret_or_env("OPENAI_ORG_ID"),
+            project=self._get_secret_or_env("OPENAI_PROJECT_ID"),
+            base_url=self._get_secret_or_env("OPENAI_BASE_URL"),
+            timeout=self._get_secret_or_env("OPENAI_TIMEOUT"),
+            max_retries=self._get_secret_or_env("OPENAI_MAX_RETRIES"),
+        )
+        return self._sanitize_options(res)
+
+    def customized_invoke(
+        self, operation: Optional[Callable[..., T]] = None, **invoke_kwargs
+    ) -> Optional[T]:
+        invoke_kwargs = self.get_invoke_kwargs(invoke_kwargs)
+        if operation:
+            return operation(**invoke_kwargs, model=self.model)
+        else:
+            return self._default_operation(**invoke_kwargs, model=self.model)
+
+    def _get_messages_parameter(
+        self, prompt: Optional[str] = None, **invoke_kwargs
+    ) -> (str, dict):
+        invoke_kwargs = self.get_invoke_kwargs(invoke_kwargs)
+        messages = invoke_kwargs.get("messages")
+        if messages:
+            if prompt:
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "can not provide 'messages' and 'prompt' to invoke"
+                )
+        elif prompt:
+            messages = [
+                {
+                    "role": "user",
+                    "content": prompt,
+                },
+            ]
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                "must provide 'messages' or 'prompt' to invoke"
+            )
+        return messages, invoke_kwargs
+
+    def invoke(self, prompt: Optional[str] = None, **invoke_kwargs) -> str:
+        messages, invoke_kwargs = self._get_messages_parameter(
+            prompt=prompt, **invoke_kwargs
+        )
+        response = self._default_operation(
+            model=self.endpoint, messages=messages, **invoke_kwargs
+        )
+        return response.choices[0].message.content
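
A hedged usage sketch of the new provider, assuming the openai package is installed and an API key is supplied through the secrets dict. The parent argument is normally provided by MLRun's datastore machinery, so a trivial stub stands in for it here; the model name and the "sk-..." key are placeholders.

```python
from mlrun.datastore.model_provider.openai_provider import OpenAIProvider


class _StubParent:
    """Stand-in for the datastore object that normally owns the provider."""

    def secret(self, key):
        return None  # no project secrets in this sketch


provider = OpenAIProvider(
    parent=_StubParent(),
    schema="openai",          # anything else raises MLRunInvalidArgumentError
    name="my-openai",
    endpoint="gpt-4o-mini",   # used as the model name; empty falls back to the mlconf default
    secrets={"OPENAI_API_KEY": "sk-..."},
    default_invoke_kwargs={"temperature": 0},
)

# a prompt is wrapped into a single user message; passing both prompt and messages raises
print(provider.invoke(prompt="Summarize what MLRun does in one sentence."))

# customized_invoke forwards kwargs (plus model=) to chat.completions.create by default
response = provider.customized_invoke(messages=[{"role": "user", "content": "hi"}])
print(response.choices[0].message.content)
```
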
mlrun/datastore/remote_client.py ADDED
@@ -0,0 +1,54 @@
+# Copyright 2025 Iguazio
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+import mlrun
+
+
+class BaseRemoteClient:
+    def __init__(self, parent, kind, name, endpoint="", secrets: Optional[dict] = None):
+        self._parent = parent
+        self.kind = kind
+        self.name = name
+        self.endpoint = endpoint
+        self._secrets = secrets or {}
+        self.secret_pfx = ""
+
+    def _get_secret_or_env(self, key, default=None):
+        # Project-secrets are mounted as env variables whose name can be retrieved from SecretsStore
+        return mlrun.get_secret_or_env(
+            key, secret_provider=self._get_secret, default=default
+        )
+
+    def _get_parent_secret(self, key):
+        return self._parent.secret(self.secret_pfx + key)
+
+    def _get_secret(self, key: str, default=None):
+        return self._secrets.get(key, default) or self._get_parent_secret(key)
+
+    @property
+    def url(self):
+        return f"{self.kind}://{self.endpoint}"
+
+    @staticmethod
+    def _sanitize_options(options):
+        if not options:
+            return {}
+        options = {k: v for k, v in options.items() if v is not None and v != ""}
+        return options
+
+    @classmethod
+    def parse_endpoint_and_path(cls, endpoint, subpath) -> (str, str):
+        return endpoint, subpath
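
The _sanitize_options helper above drops unset options before they reach a client SDK; the datastore hunks below apply the same rename from _sanitize_storage_options to _sanitize_options. A quick illustration of the behavior of the method shown here:

```python
from mlrun.datastore.remote_client import BaseRemoteClient

options = {"api_key": "abc", "timeout": None, "profile": "", "max_retries": 3}
print(BaseRemoteClient._sanitize_options(options))
# -> {'api_key': 'abc', 'max_retries': 3}  (None and empty-string values are dropped)
```
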
mlrun/datastore/s3.py CHANGED
@@ -186,7 +186,7 @@ class S3Store(DataStore):
         if profile:
             storage_options["profile"] = profile
 
-        return self._sanitize_storage_options(storage_options)
+        return self._sanitize_options(storage_options)
 
     @property
     def spark_url(self):
mlrun/datastore/storeytargets.py CHANGED
@@ -46,7 +46,7 @@ def get_url_and_storage_options(path, external_storage_options=None):
         storage_options = merge(external_storage_options, storage_options)
     else:
         storage_options = storage_options or external_storage_options
-    return url, DataStore._sanitize_storage_options(storage_options)
+    return url, DataStore._sanitize_options(storage_options)
 
 
 class TDEngineStoreyTarget(storey.TDEngineTarget):
mlrun/datastore/utils.py CHANGED
@@ -311,3 +311,25 @@ class KafkaParameters:
         valid_keys.update(ref_dict.keys())
         # Return a new dictionary with only valid keys
         return {k: v for k, v in input_dict.items() if k in valid_keys}
+
+
+def parse_url(url):
+    if url and url.startswith("v3io://") and not url.startswith("v3io:///"):
+        url = url.replace("v3io://", "v3io:///", 1)
+    parsed_url = urlparse(url)
+    schema = parsed_url.scheme.lower()
+    endpoint = parsed_url.hostname
+    if endpoint:
+        # HACK - urlparse returns the hostname after in lower case - we want the original case:
+        # the hostname is a substring of the netloc, in which it's the original case, so we find the indexes of the
+        # hostname in the netloc and take it from there
+        lower_hostname = parsed_url.hostname
+        netloc = str(parsed_url.netloc)
+        lower_netloc = netloc.lower()
+        hostname_index_in_netloc = lower_netloc.index(str(lower_hostname))
+        endpoint = netloc[
+            hostname_index_in_netloc : hostname_index_in_netloc + len(lower_hostname)
+        ]
+        if parsed_url.port:
+            endpoint += f":{parsed_url.port}"
+    return schema, endpoint, parsed_url
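
A short illustration of the new parse_url helper, worked through by hand from the code above (the URLs are made up, and the function is assumed to be exposed at module level as the hunk suggests): the scheme is lower-cased, the hostname keeps its original casing, and v3io:// URLs are normalized to the triple-slash form, which leaves them without an endpoint.

```python
from mlrun.datastore.utils import parse_url

schema, endpoint, parsed = parse_url("s3://MyBucket/path/to/key")
print(schema, endpoint)   # -> "s3 MyBucket"  (case of the host is preserved)

schema, endpoint, parsed = parse_url("ds://profile-name@MyHost:9000/some/path")
print(schema, endpoint)   # -> "ds MyHost:9000"  (port is re-appended)

schema, endpoint, parsed = parse_url("v3io://container/path")
print(schema, endpoint)   # -> "v3io None"  (rewritten to v3io:///, so no hostname)
```
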
mlrun/datastore/v3io.py CHANGED
@@ -97,7 +97,7 @@ class V3ioStore(DataStore):
             v3io_access_key=self._get_secret_or_env("V3IO_ACCESS_KEY"),
             v3io_api=mlrun.mlconf.v3io_api,
         )
-        return self._sanitize_storage_options(res)
+        return self._sanitize_options(res)
 
     def _upload(
         self,
mlrun/db/base.py CHANGED
@@ -44,7 +44,7 @@ class RunDBInterface(ABC):
         pass
 
     @abstractmethod
-    def get_log(self, uid, project="", offset=0, size=0):
+    def get_log(self, uid, project="", offset=0, size=0, attempt=None):
         pass
 
     @abstractmethod
mlrun/db/httpdb.py CHANGED
@@ -608,7 +608,7 @@ class HTTPRunDB(RunDBInterface):
         error = f"store log {project}/{uid}"
         self.api_call("POST", path, error, params, body)
 
-    def get_log(self, uid, project="", offset=0, size=None):
+    def get_log(self, uid, project="", offset=0, size=None, attempt=None):
         """Retrieve 1 MB data of log.
 
         :param uid: Log unique ID
@@ -616,6 +616,8 @@ class HTTPRunDB(RunDBInterface):
         :param offset: Retrieve partial log, get up to ``size`` bytes starting at offset ``offset``
             from beginning of log (must be >= 0)
         :param size: If set to ``-1`` will retrieve and print all data to end of the log by chunks of 1MB each.
+        :param attempt: For retriable runs, the attempt number to retrieve the log for.
+            1 is the initial attempt.
         :returns: The following objects:
 
             - state - The state of the runtime object which generates this log, if it exists. In case no known state
@@ -636,6 +638,8 @@ class HTTPRunDB(RunDBInterface):
             return state, offset
 
         params = {"offset": offset, "size": size}
+        if attempt:
+            params["attempt"] = attempt
         path = self._path_of("logs", project, uid)
         error = f"get log {project}/{uid}"
         resp = self.api_call("GET", path, error, params=params)
@@ -658,7 +662,7 @@ class HTTPRunDB(RunDBInterface):
         resp = self.api_call("GET", path, error)
         return resp.json()["size"]
 
-    def watch_log(self, uid, project="", watch=True, offset=0):
+    def watch_log(self, uid, project="", watch=True, offset=0, attempt=None):
         """Retrieve logs of a running process by chunks of 1MB, and watch the progress of the execution until it
         completes. This method will print out the logs and continue to periodically poll for, and print,
         new logs as long as the state of the runtime which generates this log is either ``pending`` or ``running``.
@@ -668,10 +672,11 @@ class HTTPRunDB(RunDBInterface):
         :param watch: If set to ``True`` will continue tracking the log as described above. Otherwise this function
             is practically equivalent to the :py:func:`~get_log` function.
         :param offset: Minimal offset in the log to watch.
+        :param attempt: For retriable runs, the attempt number to retrieve the log for. 1 is the initial attempt.
         :returns: The final state of the log being watched and the final offset.
         """
 
-        state, text = self.get_log(uid, project, offset=offset)
+        state, text = self.get_log(uid, project, offset=offset, attempt=attempt)
         if text:
             print(text.decode(errors=mlrun.mlconf.httpdb.logs.decode.errors))
         nil_resp = 0
@@ -687,7 +692,7 @@ class HTTPRunDB(RunDBInterface):
                         mlrun.mlconf.httpdb.logs.pull_logs_backoff_no_logs_default_interval
                     )
                 )
-            state, text = self.get_log(uid, project, offset=offset)
+            state, text = self.get_log(uid, project, offset=offset, attempt=attempt)
             if text:
                 nil_resp = 0
                 print(
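
The attempt parameter threaded through get_log and watch_log above lets clients pull the log of a specific attempt of a retriable run. A hedged sketch against the HTTP run DB; the uid and project values are placeholders.

```python
import mlrun

db = mlrun.get_run_db()

# attempt=1 is the initial attempt; attempt=2 is the first retry
state, log_bytes = db.get_log(uid="<run-uid>", project="my-project", attempt=2)
print(state, log_bytes.decode(errors="replace"))

# watch_log forwards the same attempt to every get_log poll
db.watch_log(uid="<run-uid>", project="my-project", watch=False, attempt=2)
```
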
mlrun/db/nopdb.py CHANGED
@@ -63,7 +63,7 @@ class NopDB(RunDBInterface):
     def store_log(self, uid, project="", body=None, append=False):
         pass
 
-    def get_log(self, uid, project="", offset=0, size=0):
+    def get_log(self, uid, project="", offset=0, size=0, attempt=None):
         pass
 
     def store_run(self, struct, uid, project="", iter=0):
mlrun/execution.py CHANGED
@@ -26,6 +26,7 @@ from dateutil import parser
 import mlrun
 import mlrun.common.constants as mlrun_constants
 import mlrun.common.formatters
+import mlrun.common.runtimes.constants
 from mlrun.artifacts import (
     Artifact,
     DatasetArtifact,
@@ -91,6 +92,8 @@ class MLClientCtx:
         self._autocommit = autocommit
         self._notifications = []
         self._state_thresholds = {}
+        self._retry_spec = {}
+        self._retry_count = None
 
         self._labels = {}
         self._annotations = {}
@@ -432,6 +435,7 @@ class MLClientCtx:
         self._tolerations = spec.get("tolerations", self._tolerations)
         self._affinity = spec.get("affinity", self._affinity)
         self._reset_on_run = spec.get("reset_on_run", self._reset_on_run)
+        self._retry_spec = spec.get("retry", self._retry_spec)
         self._init_dbs(rundb)
 
 
@@ -450,10 +454,11 @@ class MLClientCtx:
         if start:
             start = parser.parse(start) if isinstance(start, str) else start
             self._start_time = start
-        self._state = "running"
+        self._state = mlrun.common.runtimes.constants.RunStates.running
 
         status = attrs.get("status")
-        if include_status and status:
+        retry_configured = self._retry_spec and self._retry_spec.get("count")
+        if (include_status or retry_configured) and status:
             self._results = status.get("results", self._results)
             for artifact in status.get("artifacts", []):
                 artifact_obj = dict_to_artifact(artifact)
@@ -462,7 +467,10 @@ class MLClientCtx:
                 )
             for key, uri in status.get("artifact_uris", {}).items():
                 self._artifacts_manager.artifact_uris[key] = uri
-            self._state = status.get("state", self._state)
+            self._retry_count = status.get("retry_count", self._retry_count)
+            # if run is a retry, the state needs to move to running
+            if include_status:
+                self._state = status.get("state", self._state)
 
         # No need to store the run for every worker
         if store_run and self.is_logging_worker():
@@ -1107,13 +1115,13 @@ class MLClientCtx:
         :param completed: Mark run as completed
         """
         # Changing state to completed is allowed only when the execution is in running state
-        if self._state != "running":
+        if self._state != mlrun.common.runtimes.constants.RunStates.running:
            completed = False
 
         if message:
             self._annotations["message"] = message
         if completed:
-            self._state = "completed"
+            self._state = mlrun.common.runtimes.constants.RunStates.completed
 
         if self._parent:
             self._parent.update_child_iterations()
@@ -1147,9 +1155,15 @@ class MLClientCtx:
         updates = {"status.last_update": now_date().isoformat()}
 
         if error is not None:
-            self._state = "error"
+            state = mlrun.common.runtimes.constants.RunStates.error
+            max_retries = self._retry_spec.get("count", 0)
+            self._retry_count = self._retry_count or 0
+            if max_retries and self._retry_count < max_retries:
+                state = mlrun.common.runtimes.constants.RunStates.pending_retry
+
+            self._state = state
             self._error = str(error)
-            updates["status.state"] = "error"
+            updates["status.state"] = state
             updates["status.error"] = error
         elif (
             execution_state
@@ -1241,11 +1255,13 @@ class MLClientCtx:
                 "node_selector": self._node_selector,
                 "tolerations": self._tolerations,
                 "affinity": self._affinity,
+                "retry": self._retry_spec,
             },
             "status": {
                 "results": self._results,
                 "start_time": to_date_str(self._start_time),
                 "last_update": to_date_str(self._last_update),
+                "retry_count": self._retry_count,
             },
         }
 
mlrun/launcher/base.py CHANGED
@@ -18,6 +18,8 @@ import os
 import uuid
 from typing import Any, Callable, Optional, Union
 
+import mlrun.common.constants
+import mlrun.common.runtimes.constants
 import mlrun.common.schemas
 import mlrun.config
 import mlrun.errors
@@ -72,6 +74,7 @@ class BaseLauncher(abc.ABC):
         notifications: Optional[list[mlrun.model.Notification]] = None,
         returns: Optional[list[Union[str, dict[str, str]]]] = None,
         state_thresholds: Optional[dict[str, int]] = None,
+        retry: Optional[Union[mlrun.model.Retry, dict]] = None,
     ) -> "mlrun.run.RunObject":
         """run the function from the server/client[local/remote]"""
         pass
@@ -133,7 +136,7 @@ class BaseLauncher(abc.ABC):
         """Check if the runtime requires to build the image and updates the spec accordingly"""
         pass
 
-    def _validate_runtime(
+    def _validate_run(
         self,
         runtime: "mlrun.runtimes.BaseRuntime",
         run: "mlrun.run.RunObject",
@@ -194,7 +197,7 @@ class BaseLauncher(abc.ABC):
             )
 
     @classmethod
-    def _validate_run_single_param(cls, param_name, param_value):
+    def _validate_run_single_param(cls, param_name: str, param_value: int):
         # verify that integer parameters don't exceed a int64
         if isinstance(param_value, int) and abs(param_value) >= 2**63:
             raise mlrun.errors.MLRunInvalidArgumentError(
@@ -203,8 +206,6 @@ class BaseLauncher(abc.ABC):
             )
 
    @staticmethod
    def _create_run_object(task):
-        valid_task_types = (dict, mlrun.run.RunTemplate, mlrun.run.RunObject)
-
         if not task:
             # if task passed generate default RunObject
             return mlrun.run.RunObject.from_dict(task)
@@ -215,18 +216,18 @@ class BaseLauncher(abc.ABC):
         if isinstance(task, str):
             task = ast.literal_eval(task)
 
-        if not isinstance(task, valid_task_types):
-            raise mlrun.errors.MLRunInvalidArgumentError(
-                f"Task is not a valid object, type={type(task)}, expected types={valid_task_types}"
-            )
-
+        valid_task_types = (dict, mlrun.run.RunTemplate, mlrun.run.RunObject)
+        if isinstance(task, mlrun.run.RunObject):
+            # if task is already a RunObject, we can return it as is
+            return task
         if isinstance(task, mlrun.run.RunTemplate):
             return mlrun.run.RunObject.from_template(task)
         elif isinstance(task, dict):
             return mlrun.run.RunObject.from_dict(task)
 
-        # task is already a RunObject
-        return task
+        raise mlrun.errors.MLRunInvalidArgumentError(
+            f"Task is not a valid object, type={type(task)}, expected types={valid_task_types}"
+        )
 
     @staticmethod
     def _enrich_run(
@@ -246,6 +247,7 @@ class BaseLauncher(abc.ABC):
         workdir=None,
         notifications: Optional[list[mlrun.model.Notification]] = None,
         state_thresholds: Optional[dict[str, int]] = None,
+        retry: Optional[Union[mlrun.model.Retry, dict]] = None,
     ):
         run.spec.handler = (
             handler or run.spec.handler or runtime.spec.default_handler or ""
@@ -364,6 +366,7 @@ class BaseLauncher(abc.ABC):
                 | state_thresholds
             )
         run.spec.state_thresholds = state_thresholds or run.spec.state_thresholds
+        run.spec.retry = retry or run.spec.retry
         return run
 
     @staticmethod
@@ -410,7 +413,7 @@ class BaseLauncher(abc.ABC):
                 )
             if (
                 run.status.state
-                in mlrun.common.runtimes.constants.RunStates.error_and_abortion_states()
+                in mlrun.common.runtimes.constants.RunStates.error_states()
             ):
                 if runtime._is_remote and not runtime.is_child:
                     logger.error(
@@ -418,7 +421,14 @@ class BaseLauncher(abc.ABC):
                         state=run.status.state,
                         status=run.status.to_dict(),
                     )
-                raise mlrun.runtimes.utils.RunError(run.error)
+
+                error = run.error
+                if (
+                    run.status.state
+                    == mlrun.common.runtimes.constants.RunStates.pending_retry
+                ):
+                    error = f"Run is pending retry, error: {run.error}"
+                raise mlrun.runtimes.utils.RunError(error)
             return run
 
         return None
mlrun/launcher/local.py CHANGED
@@ -72,6 +72,7 @@ class ClientLocalLauncher(launcher.ClientBaseLauncher):
         returns: Optional[list[Union[str, dict[str, str]]]] = None,
         state_thresholds: Optional[dict[str, int]] = None,
         reset_on_run: Optional[bool] = None,
+        retry: Optional[Union[mlrun.model.Retry, dict]] = None,
     ) -> "mlrun.run.RunObject":
         # do not allow local function to be scheduled
         if schedule is not None:
@@ -122,8 +123,9 @@ class ClientLocalLauncher(launcher.ClientBaseLauncher):
             workdir=workdir,
             notifications=notifications,
             state_thresholds=state_thresholds,
+            retry=retry,
         )
-        self._validate_runtime(runtime, run)
+        self._validate_run(runtime, run)
         result = self._execute(
             runtime=runtime,
             run=run,
mlrun/launcher/remote.py CHANGED
@@ -61,6 +61,7 @@ class ClientRemoteLauncher(launcher.ClientBaseLauncher):
         returns: Optional[list[Union[str, dict[str, str]]]] = None,
         state_thresholds: Optional[dict[str, int]] = None,
         reset_on_run: Optional[bool] = None,
+        retry: Optional[Union[mlrun.model.Retry, dict]] = None,
     ) -> "mlrun.run.RunObject":
         self.enrich_runtime(runtime, project)
         run = self._create_run_object(task)
@@ -82,8 +83,9 @@ class ClientRemoteLauncher(launcher.ClientBaseLauncher):
             workdir=workdir,
             notifications=notifications,
             state_thresholds=state_thresholds,
+            retry=retry,
         )
-        self._validate_runtime(runtime, run)
+        self._validate_run(runtime, run)
 
         if not runtime.is_deployed():
             if runtime.spec.build.auto_build or auto_build:
@@ -190,7 +192,7 @@ class ClientRemoteLauncher(launcher.ClientBaseLauncher):
         return self._wrap_run_result(runtime, resp, run, schedule=schedule)
 
     @classmethod
-    def _validate_run_single_param(cls, param_name, param_value):
+    def _validate_run_single_param(cls, param_name: str, param_value: int):
         if isinstance(param_value, pd.DataFrame):
             raise mlrun.errors.MLRunInvalidArgumentTypeError(
                 f"Parameter '{param_name}' has an unsupported value of type"