mlrun 1.10.0rc11__py3-none-any.whl → 1.10.0rc13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic; further details are available from the package registry's advisory for this release.

Files changed (59):
  1. mlrun/__init__.py +2 -1
  2. mlrun/__main__.py +7 -1
  3. mlrun/artifacts/base.py +9 -3
  4. mlrun/artifacts/dataset.py +2 -1
  5. mlrun/artifacts/llm_prompt.py +6 -2
  6. mlrun/artifacts/model.py +2 -2
  7. mlrun/common/constants.py +1 -0
  8. mlrun/common/runtimes/constants.py +10 -1
  9. mlrun/common/schemas/__init__.py +1 -1
  10. mlrun/common/schemas/model_monitoring/model_endpoints.py +1 -1
  11. mlrun/common/schemas/serving.py +7 -0
  12. mlrun/config.py +21 -2
  13. mlrun/datastore/__init__.py +3 -1
  14. mlrun/datastore/alibaba_oss.py +1 -1
  15. mlrun/datastore/azure_blob.py +1 -1
  16. mlrun/datastore/base.py +6 -31
  17. mlrun/datastore/datastore.py +109 -33
  18. mlrun/datastore/datastore_profile.py +31 -0
  19. mlrun/datastore/dbfs_store.py +1 -1
  20. mlrun/datastore/google_cloud_storage.py +2 -2
  21. mlrun/datastore/model_provider/__init__.py +13 -0
  22. mlrun/datastore/model_provider/model_provider.py +160 -0
  23. mlrun/datastore/model_provider/openai_provider.py +144 -0
  24. mlrun/datastore/remote_client.py +65 -0
  25. mlrun/datastore/s3.py +1 -1
  26. mlrun/datastore/storeytargets.py +1 -1
  27. mlrun/datastore/utils.py +22 -0
  28. mlrun/datastore/v3io.py +1 -1
  29. mlrun/db/base.py +1 -1
  30. mlrun/db/httpdb.py +9 -4
  31. mlrun/db/nopdb.py +1 -1
  32. mlrun/execution.py +28 -7
  33. mlrun/launcher/base.py +23 -13
  34. mlrun/launcher/local.py +3 -1
  35. mlrun/launcher/remote.py +4 -2
  36. mlrun/model.py +65 -0
  37. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +175 -8
  38. mlrun/package/packagers_manager.py +2 -0
  39. mlrun/projects/operations.py +8 -1
  40. mlrun/projects/pipelines.py +40 -18
  41. mlrun/projects/project.py +28 -5
  42. mlrun/run.py +42 -2
  43. mlrun/runtimes/__init__.py +6 -0
  44. mlrun/runtimes/base.py +24 -6
  45. mlrun/runtimes/daskjob.py +1 -0
  46. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  47. mlrun/runtimes/local.py +1 -6
  48. mlrun/serving/server.py +1 -2
  49. mlrun/serving/states.py +438 -23
  50. mlrun/serving/system_steps.py +27 -29
  51. mlrun/utils/helpers.py +13 -2
  52. mlrun/utils/notifications/notification_pusher.py +15 -0
  53. mlrun/utils/version/version.json +2 -2
  54. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/METADATA +2 -2
  55. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/RECORD +59 -55
  56. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/WHEEL +0 -0
  57. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/entry_points.txt +0 -0
  58. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/licenses/LICENSE +0 -0
  59. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/top_level.txt +0 -0
@@ -1081,34 +1081,56 @@ def rerun_workflow(
1081
1081
  :param run_uid: The run UID of the original workflow to retry.
1082
1082
  :param project_name: The project name.
1083
1083
  """
1084
+ db = mlrun.get_run_db()
1084
1085
 
1085
1086
  try:
1086
- # TODO in followups: handle start and running notifications
1087
-
1088
- # Retry the pipeline - TODO: add submit-direct flag when created
1089
- db = mlrun.get_run_db()
1087
+ # Invoke the KFP retry endpoint (direct-submit mode)
1090
1088
  new_pipeline_id = db.retry_pipeline(
1091
- run_uid, project_name, submit_mode=mlrun_constants.WorkflowSubmitMode.direct
1089
+ run_id=run_uid,
1090
+ project=project_name,
1091
+ submit_mode=mlrun_constants.WorkflowSubmitMode.direct,
1092
+ )
1093
+ logger.info(
1094
+ "KFP retry submitted",
1095
+ new_pipeline_id=new_pipeline_id,
1096
+ rerun_of_workflow=run_uid,
1092
1097
  )
1093
1098
 
1094
- # Store result for observability
1095
- context.set_label(
1096
- mlrun_constants.MLRunInternalLabels.workflow_id, new_pipeline_id
1099
+ except mlrun.errors.MLRunHTTPError as http_exc:
1100
+ logger.error(
1101
+ "Failed calling KFP retry API",
1102
+ run_id=run_uid,
1103
+ error=err_to_str(http_exc),
1097
1104
  )
1098
- context.update_run()
1105
+ raise
1099
1106
 
1100
- context.log_result("workflow_id", new_pipeline_id)
1107
+ # Enqueue "running" notifications server-side for this RerunRunner run
1108
+ db.push_run_notifications(context.uid, project_name)
1101
1109
 
1102
- # wait for pipeline completion so monitor will push terminal notifications
1103
- wait_for_pipeline_completion(
1110
+ context.set_label(mlrun_constants.MLRunInternalLabels.workflow_id, new_pipeline_id)
1111
+ context.update_run()
1112
+
1113
+ context.log_result("workflow_id", new_pipeline_id)
1114
+
1115
+ try:
1116
+ pipeline = wait_for_pipeline_completion(
1104
1117
  new_pipeline_id,
1105
1118
  project=project_name,
1106
1119
  )
1107
-
1108
- # Temporary exception
1109
1120
  except Exception as exc:
1110
- context.logger.error("Failed to rerun workflow", exc=err_to_str(exc))
1111
- raise
1121
+ mlrun.utils.logger.error(
1122
+ "Failed waiting for workflow completion",
1123
+ rerun_pipeline_id=new_pipeline_id,
1124
+ exc=err_to_str(exc),
1125
+ )
1126
+ else:
1127
+ final_state = pipeline["run"]["status"]
1128
+ context.log_result("workflow_state", final_state, commit=True)
1129
+
1130
+ if final_state != mlrun_pipelines.common.models.RunStatuses.succeeded:
1131
+ raise mlrun.errors.MLRunRuntimeError(
1132
+ f"Pipeline retry of {run_uid} finished in state={final_state}"
1133
+ )
1112
1134
 
1113
1135
 
1114
1136
  def load_and_run(context, *args, **kwargs):
@@ -1201,13 +1223,13 @@ def load_and_run_workflow(
1201
1223
  start_notifications = [
1202
1224
  notification
1203
1225
  for notification in context.get_notifications(unmask_secret_params=True)
1204
- if "running" in notification.when
1226
+ if mlrun.common.runtimes.constants.RunStates.running in notification.when
1205
1227
  ]
1206
1228
 
1207
1229
  # Prevent redundant notifications for run completion by ensuring that notifications are only triggered when the run
1208
1230
  # reaches the "running" state, as the server already handles the completion notifications.
1209
1231
  for notification in start_notifications:
1210
- notification.when = ["running"]
1232
+ notification.when = [mlrun.common.runtimes.constants.RunStates.running]
1211
1233
 
1212
1234
  workflow_log_message = workflow_name or workflow_path
1213
1235
  context.logger.info(
mlrun/projects/project.py CHANGED
@@ -159,7 +159,8 @@ def new_project(
159
159
  parameters: Optional[dict] = None,
160
160
  default_function_node_selector: Optional[dict] = None,
161
161
  ) -> "MlrunProject":
162
- """Create a new MLRun project, optionally load it from a yaml/zip/git template
162
+ """Create a new MLRun project, optionally load it from a yaml/zip/git template.
163
+ The project will become the active project for the current session.
163
164
 
164
165
  A new project is created and returned, you can customize the project by placing a project_setup.py file
165
166
  in the project root dir, it will be executed upon project creation or loading.
@@ -326,7 +327,8 @@ def load_project(
326
327
  parameters: Optional[dict] = None,
327
328
  allow_cross_project: Optional[bool] = None,
328
329
  ) -> "MlrunProject":
329
- """Load an MLRun project from git or tar or dir
330
+ """Load an MLRun project from git or tar or dir. The project will become the active project for
331
+ the current session.
330
332
 
331
333
  MLRun looks for a project.yaml file with project definition and objects in the project root path
332
334
  and use it to initialize the project, in addition it runs the project_setup.py file (if it exists)
@@ -1940,6 +1942,11 @@ class MlrunProject(ModelObj):
1940
1942
  :returns: The logged `LLMPromptArtifact` object.
1941
1943
  """
1942
1944
 
1945
+ if not prompt_string and not prompt_path:
1946
+ raise mlrun.errors.MLRunInvalidArgumentError(
1947
+ "Either 'prompt_string' or 'prompt_path' must be provided"
1948
+ )
1949
+
1943
1950
  llm_prompt = LLMPromptArtifact(
1944
1951
  key=key,
1945
1952
  project=self.name,
@@ -2688,8 +2695,8 @@ class MlrunProject(ModelObj):
2688
2695
  requirements_file: str = "",
2689
2696
  ) -> mlrun.runtimes.BaseRuntime:
2690
2697
  """
2691
- | Update or add a function object to the project.
2692
- | Function can be provided as an object (func) or a .py/.ipynb/.yaml URL.
2698
+ Update or add a function object to the project.
2699
+ Function can be provided as an object (func) or a .py/.ipynb/.yaml URL.
2693
2700
 
2694
2701
  | Creating a function from a single file is done by specifying ``func`` and disabling ``with_repo``.
2695
2702
  | Creating a function with project source (specify ``with_repo=True``):
@@ -2734,6 +2741,20 @@ class MlrunProject(ModelObj):
2734
2741
  # By providing a path to a pip requirements file
2735
2742
  proj.set_function("my.py", requirements="requirements.txt")
2736
2743
 
2744
+ One of the most important parameters is 'kind', used to specify the chosen runtime. The options are:
2745
+ - local: execute a local python or shell script
2746
+ - job: insert the code into a Kubernetes pod and execute it
2747
+ - nuclio: insert the code into a real-time serverless nuclio function
2748
+ - serving: insert code into orchestrated nuclio function(s) forming a DAG
2749
+ - dask: run the specified python code / script as Dask Distributed job
2750
+ - mpijob: run distributed Horovod jobs over the MPI job operator
2751
+ - spark: run distributed Spark job using Spark Kubernetes Operator
2752
+ - remote-spark: run distributed Spark job on remote Spark service
2753
+ - databricks: run code on Databricks cluster (python scripts, Spark etc.)
2754
+ - application: run a long living application (e.g. a web server, UI, etc.)
2755
+
2756
+ Learn more about :doc:`../../concepts/functions-overview`.
2757
+
2737
2758
  :param func: Function object or spec/code url, None refers to current Notebook
2738
2759
  :param name: Name of the function (under the project), can be specified with a tag to support
2739
2760
  Versions (e.g. myfunc:v1). If the `tag` parameter is provided, the tag in the name
@@ -3967,6 +3988,7 @@ class MlrunProject(ModelObj):
3967
3988
  builder_env: Optional[dict] = None,
3968
3989
  reset_on_run: Optional[bool] = None,
3969
3990
  output_path: Optional[str] = None,
3991
+ retry: Optional[Union[mlrun.model.Retry, dict]] = None,
3970
3992
  ) -> typing.Union[mlrun.model.RunObject, PipelineNodeWrapper]:
3971
3993
  """Run a local or remote task as part of a local/kubeflow pipeline
3972
3994
 
@@ -4029,7 +4051,7 @@ class MlrunProject(ModelObj):
4029
4051
  This ensures latest code changes are executed. This argument must be used in
4030
4052
  conjunction with the local=True argument.
4031
4053
  :param output_path: path to store artifacts, when running in a workflow this will be set automatically
4032
-
4054
+ :param retry: Retry configuration for the run, can be a dict or an instance of mlrun.model.Retry.
4033
4055
  :return: MLRun RunObject or PipelineNodeWrapper
4034
4056
  """
4035
4057
  if artifact_path:
@@ -4068,6 +4090,7 @@ class MlrunProject(ModelObj):
4068
4090
  returns=returns,
4069
4091
  builder_env=builder_env,
4070
4092
  reset_on_run=reset_on_run,
4093
+ retry=retry,
4071
4094
  )
4072
4095
 
4073
4096
  def build_function(
mlrun/run.py CHANGED
@@ -36,6 +36,7 @@ import mlrun.common.schemas
36
36
  import mlrun.errors
37
37
  import mlrun.utils.helpers
38
38
  import mlrun_pipelines.utils
39
+ from mlrun.datastore.model_provider.model_provider import ModelProvider
39
40
  from mlrun_pipelines.common.models import RunStatuses
40
41
  from mlrun_pipelines.common.ops import format_summary_from_kfp_run, show_kfp_run
41
42
 
@@ -894,7 +895,7 @@ def _run_pipeline(
894
895
  def retry_pipeline(
895
896
  run_id: str,
896
897
  project: str,
897
- ) -> str:
898
+ ) -> typing.Union[str, dict[str, str]]:
898
899
  """Retry a pipeline run.
899
900
 
900
901
  This function retries a previously executed pipeline run using the specified run ID. If the run is not in a
@@ -913,10 +914,33 @@ def retry_pipeline(
913
914
  "Please set the dbpath URL."
914
915
  )
915
916
 
916
- pipeline_run_id = mldb.retry_pipeline(
917
+ # Invoke retry pipeline run. Depending on the context, this call returns either:
918
+ # 1. A simple string of a workflow-id, for direct retries or non-remote workflows, or
919
+ # 2. A dict payload representing a WorkflowResponse when rerunning remote workflows.
920
+ rerun_response = mldb.retry_pipeline(
917
921
  run_id=run_id,
918
922
  project=project,
919
923
  )
924
+ if isinstance(rerun_response, str):
925
+ pipeline_run_id = rerun_response
926
+ else:
927
+ rerun_response = mlrun.common.schemas.WorkflowResponse(**rerun_response)
928
+
929
+ def _fetch_workflow_id():
930
+ rerun = mldb.read_run(rerun_response.run_id, project)
931
+ workflow_id = rerun["metadata"]["labels"].get("workflow-id")
932
+ if not workflow_id:
933
+ raise mlrun.errors.MLRunRuntimeError("workflow-id label not set yet")
934
+ return workflow_id
935
+
936
+ pipeline_run_id = mlrun.utils.helpers.retry_until_successful(
937
+ backoff=3,
938
+ timeout=int(mlrun.mlconf.workflows.timeouts.remote),
939
+ logger=logger,
940
+ verbose=False,
941
+ _function=_fetch_workflow_id,
942
+ )
943
+
920
944
  if pipeline_run_id == run_id:
921
945
  logger.info(
922
946
  f"Retried pipeline run ID={pipeline_run_id}, check UI for progress."
@@ -1152,6 +1176,22 @@ def get_dataitem(url, secrets=None, db=None) -> "DataItem":
1152
1176
  return stores.object(url=url)
1153
1177
 
1154
1178
 
1179
+ def get_model_provider(
1180
+ url,
1181
+ secrets=None,
1182
+ db=None,
1183
+ default_invoke_kwargs: Optional[dict] = None,
1184
+ raise_missing_schema_exception=True,
1185
+ ) -> ModelProvider:
1186
+ """get mlrun dataitem object (from path/url)"""
1187
+ store_manager.set(secrets, db=db)
1188
+ return store_manager.model_provider_object(
1189
+ url=url,
1190
+ default_invoke_kwargs=default_invoke_kwargs,
1191
+ raise_missing_schema_exception=raise_missing_schema_exception,
1192
+ )
1193
+
1194
+
1155
1195
  def download_object(url, target, secrets=None):
1156
1196
  """download mlrun dataitem (from path/url to target path)"""
1157
1197
  stores = store_manager.set(secrets)
@@ -148,6 +148,12 @@ class RuntimeKinds:
148
148
  "",
149
149
  ]
150
150
 
151
+ @staticmethod
152
+ def retriable_runtimes():
153
+ return [
154
+ RuntimeKinds.job,
155
+ ]
156
+
151
157
  @staticmethod
152
158
  def nuclio_runtimes():
153
159
  return [
mlrun/runtimes/base.py CHANGED
@@ -33,6 +33,7 @@ import mlrun.launcher.factory
33
33
  import mlrun.utils.helpers
34
34
  import mlrun.utils.notifications
35
35
  import mlrun.utils.regex
36
+ from mlrun.common.runtimes.constants import RunStates
36
37
  from mlrun.model import (
37
38
  BaseMetadata,
38
39
  HyperParamOptions,
@@ -319,6 +320,7 @@ class BaseRuntime(ModelObj):
319
320
  state_thresholds: Optional[dict[str, int]] = None,
320
321
  reset_on_run: Optional[bool] = None,
321
322
  output_path: Optional[str] = "",
323
+ retry: Optional[Union[mlrun.model.Retry, dict]] = None,
322
324
  **launcher_kwargs,
323
325
  ) -> RunObject:
324
326
  """
@@ -377,6 +379,7 @@ class BaseRuntime(ModelObj):
377
379
  This ensures latest code changes are executed. This argument must be used in
378
380
  conjunction with the local=True argument.
379
381
  :param output_path: Default artifact output path.
382
+ :param retry: Retry configuration for the run, can be a dict or an instance of mlrun.model.Retry.
380
383
  :return: Run context object (RunObject) with run metadata, results and status
381
384
  """
382
385
  if artifact_path or out_path:
@@ -414,6 +417,7 @@ class BaseRuntime(ModelObj):
414
417
  returns=returns,
415
418
  state_thresholds=state_thresholds,
416
419
  reset_on_run=reset_on_run,
420
+ retry=retry,
417
421
  )
418
422
 
419
423
  def _get_db_run(
@@ -570,12 +574,27 @@ class BaseRuntime(ModelObj):
570
574
  updates = None
571
575
  last_state = get_in(resp, "status.state", "")
572
576
  kind = get_in(resp, "metadata.labels.kind", "")
573
- if last_state == "error" or err:
577
+ if last_state in RunStates.error_states() or err:
578
+ new_state = RunStates.error
579
+ status_text = None
580
+ max_retries = get_in(resp, "spec.retry.count", 0)
581
+ retry_count = get_in(resp, "status.retry_count", 0) or 0
582
+ attempts = retry_count + 1
583
+ if max_retries:
584
+ if retry_count < max_retries:
585
+ new_state = RunStates.pending_retry
586
+ status_text = f"Run failed attempt {attempts} of {max_retries + 1}"
587
+ elif retry_count >= max_retries:
588
+ status_text = f"Run failed after {attempts} attempts"
589
+
574
590
  updates = {
575
591
  "status.last_update": now_date().isoformat(),
576
- "status.state": "error",
592
+ "status.state": new_state,
577
593
  }
578
- update_in(resp, "status.state", "error")
594
+ update_in(resp, "status.state", new_state)
595
+ if status_text:
596
+ updates["status.status_text"] = status_text
597
+ update_in(resp, "status.status_text", status_text)
579
598
  if err:
580
599
  update_in(resp, "status.error", err_to_str(err))
581
600
  err = get_in(resp, "status.error")
@@ -584,9 +603,8 @@ class BaseRuntime(ModelObj):
584
603
 
585
604
  elif (
586
605
  not was_none
587
- and last_state != mlrun.common.runtimes.constants.RunStates.completed
588
- and last_state
589
- not in mlrun.common.runtimes.constants.RunStates.error_and_abortion_states()
606
+ and last_state != RunStates.completed
607
+ and last_state not in RunStates.error_and_abortion_states()
590
608
  ):
591
609
  try:
592
610
  runtime_cls = mlrun.runtimes.get_runtime_class(kind)
mlrun/runtimes/daskjob.py CHANGED
@@ -505,6 +505,7 @@ class DaskCluster(KubejobRuntime):
505
505
  state_thresholds: Optional[dict[str, int]] = None,
506
506
  reset_on_run: Optional[bool] = None,
507
507
  output_path: Optional[str] = "",
508
+ retry: Optional[Union[mlrun.model.Retry, dict]] = None,
508
509
  **launcher_kwargs,
509
510
  ) -> RunObject:
510
511
  if state_thresholds:
@@ -233,6 +233,7 @@ def run_mlrun_databricks_job(context,task_parameters: dict, **kwargs):
233
233
  state_thresholds: Optional[dict[str, int]] = None,
234
234
  reset_on_run: Optional[bool] = None,
235
235
  output_path: Optional[str] = "",
236
+ retry: Optional[Union[mlrun.model.Retry, dict]] = None,
236
237
  **launcher_kwargs,
237
238
  ) -> RunObject:
238
239
  if local:
mlrun/runtimes/local.py CHANGED
@@ -34,6 +34,7 @@ from nuclio import Event
34
34
 
35
35
  import mlrun
36
36
  import mlrun.common.constants as mlrun_constants
37
+ import mlrun.common.runtimes.constants
37
38
  from mlrun.lists import RunList
38
39
 
39
40
  from ..errors import err_to_str
@@ -315,15 +316,9 @@ class LocalRuntime(BaseRuntime, ParallelRunner):
315
316
  return context.to_dict()
316
317
 
317
318
  # if RunError was raised it means that the error was raised as part of running the function
318
- # ( meaning the state was already updated to error ) therefore we just re-raise the error
319
319
  except RunError as err:
320
320
  raise err
321
- # this exception handling is for the case where we fail on pre-loading or post-running the function
322
- # and the state was not updated to error yet, therefore we update the state to error and raise as RunError
323
321
  except Exception as exc:
324
- # set_state here is mainly for sanity, as we will raise RunError which is expected to be handled
325
- # by the caller and will set the state to error ( in `update_run_state` )
326
- context.set_state(error=err_to_str(exc), commit=True)
327
322
  logger.error(f"Run error, {traceback.format_exc()}")
328
323
  raise RunError(
329
324
  "Failed on pre-loading / post-running of the function"
mlrun/serving/server.py CHANGED
@@ -395,7 +395,6 @@ def add_monitoring_general_steps(
395
395
  monitor_flow_step = graph.add_step(
396
396
  "mlrun.serving.system_steps.BackgroundTaskStatus",
397
397
  "background_task_status_step",
398
- context=context,
399
398
  model_endpoint_creation_strategy=mlrun.common.schemas.ModelEndpointCreationStrategy.SKIP,
400
399
  )
401
400
  graph.add_step(
@@ -410,7 +409,6 @@ def add_monitoring_general_steps(
410
409
  "monitoring_pre_processor_step",
411
410
  after="filter_none",
412
411
  full_event=True,
413
- context=context,
414
412
  model_endpoint_creation_strategy=mlrun.common.schemas.ModelEndpointCreationStrategy.SKIP,
415
413
  )
416
414
  # flatten the events
@@ -790,6 +788,7 @@ class GraphContext:
790
788
  self.verbose = False
791
789
  self.stream = None
792
790
  self.root = None
791
+ self.executor: Optional[storey.flow.RunnableExecutor] = None
793
792
 
794
793
  if nuclio_context:
795
794
  self.logger: NuclioLogger = nuclio_context.logger