mlrun 1.10.0rc11__py3-none-any.whl → 1.10.0rc13__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of mlrun might be problematic; further details are available from the package registry's advisory for this release.

Files changed (59):
  1. mlrun/__init__.py +2 -1
  2. mlrun/__main__.py +7 -1
  3. mlrun/artifacts/base.py +9 -3
  4. mlrun/artifacts/dataset.py +2 -1
  5. mlrun/artifacts/llm_prompt.py +6 -2
  6. mlrun/artifacts/model.py +2 -2
  7. mlrun/common/constants.py +1 -0
  8. mlrun/common/runtimes/constants.py +10 -1
  9. mlrun/common/schemas/__init__.py +1 -1
  10. mlrun/common/schemas/model_monitoring/model_endpoints.py +1 -1
  11. mlrun/common/schemas/serving.py +7 -0
  12. mlrun/config.py +21 -2
  13. mlrun/datastore/__init__.py +3 -1
  14. mlrun/datastore/alibaba_oss.py +1 -1
  15. mlrun/datastore/azure_blob.py +1 -1
  16. mlrun/datastore/base.py +6 -31
  17. mlrun/datastore/datastore.py +109 -33
  18. mlrun/datastore/datastore_profile.py +31 -0
  19. mlrun/datastore/dbfs_store.py +1 -1
  20. mlrun/datastore/google_cloud_storage.py +2 -2
  21. mlrun/datastore/model_provider/__init__.py +13 -0
  22. mlrun/datastore/model_provider/model_provider.py +160 -0
  23. mlrun/datastore/model_provider/openai_provider.py +144 -0
  24. mlrun/datastore/remote_client.py +65 -0
  25. mlrun/datastore/s3.py +1 -1
  26. mlrun/datastore/storeytargets.py +1 -1
  27. mlrun/datastore/utils.py +22 -0
  28. mlrun/datastore/v3io.py +1 -1
  29. mlrun/db/base.py +1 -1
  30. mlrun/db/httpdb.py +9 -4
  31. mlrun/db/nopdb.py +1 -1
  32. mlrun/execution.py +28 -7
  33. mlrun/launcher/base.py +23 -13
  34. mlrun/launcher/local.py +3 -1
  35. mlrun/launcher/remote.py +4 -2
  36. mlrun/model.py +65 -0
  37. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +175 -8
  38. mlrun/package/packagers_manager.py +2 -0
  39. mlrun/projects/operations.py +8 -1
  40. mlrun/projects/pipelines.py +40 -18
  41. mlrun/projects/project.py +28 -5
  42. mlrun/run.py +42 -2
  43. mlrun/runtimes/__init__.py +6 -0
  44. mlrun/runtimes/base.py +24 -6
  45. mlrun/runtimes/daskjob.py +1 -0
  46. mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
  47. mlrun/runtimes/local.py +1 -6
  48. mlrun/serving/server.py +1 -2
  49. mlrun/serving/states.py +438 -23
  50. mlrun/serving/system_steps.py +27 -29
  51. mlrun/utils/helpers.py +13 -2
  52. mlrun/utils/notifications/notification_pusher.py +15 -0
  53. mlrun/utils/version/version.json +2 -2
  54. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/METADATA +2 -2
  55. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/RECORD +59 -55
  56. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/WHEEL +0 -0
  57. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/entry_points.txt +0 -0
  58. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/licenses/LICENSE +0 -0
  59. {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/top_level.txt +0 -0
@@ -1081,34 +1081,56 @@ def rerun_workflow(
1081
1081
  :param run_uid: The run UID of the original workflow to retry.
1082
1082
  :param project_name: The project name.
1083
1083
  """
1084
+ db = mlrun.get_run_db()
1084
1085
 
1085
1086
  try:
1086
- # TODO in followups: handle start and running notifications
1087
-
1088
- # Retry the pipeline - TODO: add submit-direct flag when created
1089
- db = mlrun.get_run_db()
1087
+ # Invoke the KFP retry endpoint (direct-submit mode)
1090
1088
  new_pipeline_id = db.retry_pipeline(
1091
- run_uid, project_name, submit_mode=mlrun_constants.WorkflowSubmitMode.direct
1089
+ run_id=run_uid,
1090
+ project=project_name,
1091
+ submit_mode=mlrun_constants.WorkflowSubmitMode.direct,
1092
+ )
1093
+ logger.info(
1094
+ "KFP retry submitted",
1095
+ new_pipeline_id=new_pipeline_id,
1096
+ rerun_of_workflow=run_uid,
1092
1097
  )
1093
1098
 
1094
- # Store result for observability
1095
- context.set_label(
1096
- mlrun_constants.MLRunInternalLabels.workflow_id, new_pipeline_id
1099
+ except mlrun.errors.MLRunHTTPError as http_exc:
1100
+ logger.error(
1101
+ "Failed calling KFP retry API",
1102
+ run_id=run_uid,
1103
+ error=err_to_str(http_exc),
1097
1104
  )
1098
- context.update_run()
1105
+ raise
1099
1106
 
1100
- context.log_result("workflow_id", new_pipeline_id)
1107
+ # Enqueue "running" notifications server-side for this RerunRunner run
1108
+ db.push_run_notifications(context.uid, project_name)
1101
1109
 
1102
- # wait for pipeline completion so monitor will push terminal notifications
1103
- wait_for_pipeline_completion(
1110
+ context.set_label(mlrun_constants.MLRunInternalLabels.workflow_id, new_pipeline_id)
1111
+ context.update_run()
1112
+
1113
+ context.log_result("workflow_id", new_pipeline_id)
1114
+
1115
+ try:
1116
+ pipeline = wait_for_pipeline_completion(
1104
1117
  new_pipeline_id,
1105
1118
  project=project_name,
1106
1119
  )
1107
-
1108
- # Temporary exception
1109
1120
  except Exception as exc:
1110
- context.logger.error("Failed to rerun workflow", exc=err_to_str(exc))
1111
- raise
1121
+ mlrun.utils.logger.error(
1122
+ "Failed waiting for workflow completion",
1123
+ rerun_pipeline_id=new_pipeline_id,
1124
+ exc=err_to_str(exc),
1125
+ )
1126
+ else:
1127
+ final_state = pipeline["run"]["status"]
1128
+ context.log_result("workflow_state", final_state, commit=True)
1129
+
1130
+ if final_state != mlrun_pipelines.common.models.RunStatuses.succeeded:
1131
+ raise mlrun.errors.MLRunRuntimeError(
1132
+ f"Pipeline retry of {run_uid} finished in state={final_state}"
1133
+ )
1112
1134
 
1113
1135
 
1114
1136
  def load_and_run(context, *args, **kwargs):
@@ -1201,13 +1223,13 @@ def load_and_run_workflow(
1201
1223
  start_notifications = [
1202
1224
  notification
1203
1225
  for notification in context.get_notifications(unmask_secret_params=True)
1204
- if "running" in notification.when
1226
+ if mlrun.common.runtimes.constants.RunStates.running in notification.when
1205
1227
  ]
1206
1228
 
1207
1229
  # Prevent redundant notifications for run completion by ensuring that notifications are only triggered when the run
1208
1230
  # reaches the "running" state, as the server already handles the completion notifications.
1209
1231
  for notification in start_notifications:
1210
- notification.when = ["running"]
1232
+ notification.when = [mlrun.common.runtimes.constants.RunStates.running]
1211
1233
 
1212
1234
  workflow_log_message = workflow_name or workflow_path
1213
1235
  context.logger.info(
mlrun/projects/project.py CHANGED
@@ -159,7 +159,8 @@ def new_project(
159
159
  parameters: Optional[dict] = None,
160
160
  default_function_node_selector: Optional[dict] = None,
161
161
  ) -> "MlrunProject":
162
- """Create a new MLRun project, optionally load it from a yaml/zip/git template
162
+ """Create a new MLRun project, optionally load it from a yaml/zip/git template.
163
+ The project will become the active project for the current session.
163
164
 
164
165
  A new project is created and returned, you can customize the project by placing a project_setup.py file
165
166
  in the project root dir, it will be executed upon project creation or loading.
@@ -326,7 +327,8 @@ def load_project(
326
327
  parameters: Optional[dict] = None,
327
328
  allow_cross_project: Optional[bool] = None,
328
329
  ) -> "MlrunProject":
329
- """Load an MLRun project from git or tar or dir
330
+ """Load an MLRun project from git or tar or dir. The project will become the active project for
331
+ the current session.
330
332
 
331
333
  MLRun looks for a project.yaml file with project definition and objects in the project root path
332
334
  and use it to initialize the project, in addition it runs the project_setup.py file (if it exists)
@@ -1940,6 +1942,11 @@ class MlrunProject(ModelObj):
1940
1942
  :returns: The logged `LLMPromptArtifact` object.
1941
1943
  """
1942
1944
 
1945
+ if not prompt_string and not prompt_path:
1946
+ raise mlrun.errors.MLRunInvalidArgumentError(
1947
+ "Either 'prompt_string' or 'prompt_path' must be provided"
1948
+ )
1949
+
1943
1950
  llm_prompt = LLMPromptArtifact(
1944
1951
  key=key,
1945
1952
  project=self.name,
@@ -2688,8 +2695,8 @@ class MlrunProject(ModelObj):
2688
2695
  requirements_file: str = "",
2689
2696
  ) -> mlrun.runtimes.BaseRuntime:
2690
2697
  """
2691
- | Update or add a function object to the project.
2692
- | Function can be provided as an object (func) or a .py/.ipynb/.yaml URL.
2698
+ Update or add a function object to the project.
2699
+ Function can be provided as an object (func) or a .py/.ipynb/.yaml URL.
2693
2700
 
2694
2701
  | Creating a function from a single file is done by specifying ``func`` and disabling ``with_repo``.
2695
2702
  | Creating a function with project source (specify ``with_repo=True``):
@@ -2734,6 +2741,20 @@ class MlrunProject(ModelObj):
2734
2741
  # By providing a path to a pip requirements file
2735
2742
  proj.set_function("my.py", requirements="requirements.txt")
2736
2743
 
2744
+ One of the most important parameters is 'kind', used to specify the chosen runtime. The options are:
2745
+ - local: execute a local python or shell script
2746
+ - job: insert the code into a Kubernetes pod and execute it
2747
+ - nuclio: insert the code into a real-time serverless nuclio function
2748
+ - serving: insert code into orchestrated nuclio function(s) forming a DAG
2749
+ - dask: run the specified python code / script as Dask Distributed job
2750
+ - mpijob: run distributed Horovod jobs over the MPI job operator
2751
+ - spark: run distributed Spark job using Spark Kubernetes Operator
2752
+ - remote-spark: run distributed Spark job on remote Spark service
2753
+ - databricks: run code on Databricks cluster (python scripts, Spark etc.)
2754
+ - application: run a long living application (e.g. a web server, UI, etc.)
2755
+
2756
+ Learn more about :doc:`../../concepts/functions-overview`.
2757
+
2737
2758
  :param func: Function object or spec/code url, None refers to current Notebook
2738
2759
  :param name: Name of the function (under the project), can be specified with a tag to support
2739
2760
  Versions (e.g. myfunc:v1). If the `tag` parameter is provided, the tag in the name
@@ -3967,6 +3988,7 @@ class MlrunProject(ModelObj):
3967
3988
  builder_env: Optional[dict] = None,
3968
3989
  reset_on_run: Optional[bool] = None,
3969
3990
  output_path: Optional[str] = None,
3991
+ retry: Optional[Union[mlrun.model.Retry, dict]] = None,
3970
3992
  ) -> typing.Union[mlrun.model.RunObject, PipelineNodeWrapper]:
3971
3993
  """Run a local or remote task as part of a local/kubeflow pipeline
3972
3994
 
@@ -4029,7 +4051,7 @@ class MlrunProject(ModelObj):
4029
4051
  This ensures latest code changes are executed. This argument must be used in
4030
4052
  conjunction with the local=True argument.
4031
4053
  :param output_path: path to store artifacts, when running in a workflow this will be set automatically
4032
-
4054
+ :param retry: Retry configuration for the run, can be a dict or an instance of mlrun.model.Retry.
4033
4055
  :return: MLRun RunObject or PipelineNodeWrapper
4034
4056
  """
4035
4057
  if artifact_path:
@@ -4068,6 +4090,7 @@ class MlrunProject(ModelObj):
4068
4090
  returns=returns,
4069
4091
  builder_env=builder_env,
4070
4092
  reset_on_run=reset_on_run,
4093
+ retry=retry,
4071
4094
  )
4072
4095
 
4073
4096
  def build_function(
mlrun/run.py CHANGED
@@ -36,6 +36,7 @@ import mlrun.common.schemas
36
36
  import mlrun.errors
37
37
  import mlrun.utils.helpers
38
38
  import mlrun_pipelines.utils
39
+ from mlrun.datastore.model_provider.model_provider import ModelProvider
39
40
  from mlrun_pipelines.common.models import RunStatuses
40
41
  from mlrun_pipelines.common.ops import format_summary_from_kfp_run, show_kfp_run
41
42
 
@@ -894,7 +895,7 @@ def _run_pipeline(
894
895
  def retry_pipeline(
895
896
  run_id: str,
896
897
  project: str,
897
- ) -> str:
898
+ ) -> typing.Union[str, dict[str, str]]:
898
899
  """Retry a pipeline run.
899
900
 
900
901
  This function retries a previously executed pipeline run using the specified run ID. If the run is not in a
@@ -913,10 +914,33 @@ def retry_pipeline(
913
914
  "Please set the dbpath URL."
914
915
  )
915
916
 
916
- pipeline_run_id = mldb.retry_pipeline(
917
+ # Invoke retry pipeline run. Depending on the context, this call returns either:
918
+ # 1. A simple string of a workflow-id, for direct retries or non-remote workflows, or
919
+ # 2. A dict payload representing a WorkflowResponse when rerunning remote workflows.
920
+ rerun_response = mldb.retry_pipeline(
917
921
  run_id=run_id,
918
922
  project=project,
919
923
  )
924
+ if isinstance(rerun_response, str):
925
+ pipeline_run_id = rerun_response
926
+ else:
927
+ rerun_response = mlrun.common.schemas.WorkflowResponse(**rerun_response)
928
+
929
+ def _fetch_workflow_id():
930
+ rerun = mldb.read_run(rerun_response.run_id, project)
931
+ workflow_id = rerun["metadata"]["labels"].get("workflow-id")
932
+ if not workflow_id:
933
+ raise mlrun.errors.MLRunRuntimeError("workflow-id label not set yet")
934
+ return workflow_id
935
+
936
+ pipeline_run_id = mlrun.utils.helpers.retry_until_successful(
937
+ backoff=3,
938
+ timeout=int(mlrun.mlconf.workflows.timeouts.remote),
939
+ logger=logger,
940
+ verbose=False,
941
+ _function=_fetch_workflow_id,
942
+ )
943
+
920
944
  if pipeline_run_id == run_id:
921
945
  logger.info(
922
946
  f"Retried pipeline run ID={pipeline_run_id}, check UI for progress."
@@ -1152,6 +1176,22 @@ def get_dataitem(url, secrets=None, db=None) -> "DataItem":
1152
1176
  return stores.object(url=url)
1153
1177
 
1154
1178
 
1179
+ def get_model_provider(
1180
+ url,
1181
+ secrets=None,
1182
+ db=None,
1183
+ default_invoke_kwargs: Optional[dict] = None,
1184
+ raise_missing_schema_exception=True,
1185
+ ) -> ModelProvider:
1186
+ """get mlrun dataitem object (from path/url)"""
1187
+ store_manager.set(secrets, db=db)
1188
+ return store_manager.model_provider_object(
1189
+ url=url,
1190
+ default_invoke_kwargs=default_invoke_kwargs,
1191
+ raise_missing_schema_exception=raise_missing_schema_exception,
1192
+ )
1193
+
1194
+
1155
1195
  def download_object(url, target, secrets=None):
1156
1196
  """download mlrun dataitem (from path/url to target path)"""
1157
1197
  stores = store_manager.set(secrets)
@@ -148,6 +148,12 @@ class RuntimeKinds:
148
148
  "",
149
149
  ]
150
150
 
151
+ @staticmethod
152
+ def retriable_runtimes():
153
+ return [
154
+ RuntimeKinds.job,
155
+ ]
156
+
151
157
  @staticmethod
152
158
  def nuclio_runtimes():
153
159
  return [
mlrun/runtimes/base.py CHANGED
@@ -33,6 +33,7 @@ import mlrun.launcher.factory
33
33
  import mlrun.utils.helpers
34
34
  import mlrun.utils.notifications
35
35
  import mlrun.utils.regex
36
+ from mlrun.common.runtimes.constants import RunStates
36
37
  from mlrun.model import (
37
38
  BaseMetadata,
38
39
  HyperParamOptions,
@@ -319,6 +320,7 @@ class BaseRuntime(ModelObj):
319
320
  state_thresholds: Optional[dict[str, int]] = None,
320
321
  reset_on_run: Optional[bool] = None,
321
322
  output_path: Optional[str] = "",
323
+ retry: Optional[Union[mlrun.model.Retry, dict]] = None,
322
324
  **launcher_kwargs,
323
325
  ) -> RunObject:
324
326
  """
@@ -377,6 +379,7 @@ class BaseRuntime(ModelObj):
377
379
  This ensures latest code changes are executed. This argument must be used in
378
380
  conjunction with the local=True argument.
379
381
  :param output_path: Default artifact output path.
382
+ :param retry: Retry configuration for the run, can be a dict or an instance of mlrun.model.Retry.
380
383
  :return: Run context object (RunObject) with run metadata, results and status
381
384
  """
382
385
  if artifact_path or out_path:
@@ -414,6 +417,7 @@ class BaseRuntime(ModelObj):
414
417
  returns=returns,
415
418
  state_thresholds=state_thresholds,
416
419
  reset_on_run=reset_on_run,
420
+ retry=retry,
417
421
  )
418
422
 
419
423
  def _get_db_run(
@@ -570,12 +574,27 @@ class BaseRuntime(ModelObj):
570
574
  updates = None
571
575
  last_state = get_in(resp, "status.state", "")
572
576
  kind = get_in(resp, "metadata.labels.kind", "")
573
- if last_state == "error" or err:
577
+ if last_state in RunStates.error_states() or err:
578
+ new_state = RunStates.error
579
+ status_text = None
580
+ max_retries = get_in(resp, "spec.retry.count", 0)
581
+ retry_count = get_in(resp, "status.retry_count", 0) or 0
582
+ attempts = retry_count + 1
583
+ if max_retries:
584
+ if retry_count < max_retries:
585
+ new_state = RunStates.pending_retry
586
+ status_text = f"Run failed attempt {attempts} of {max_retries + 1}"
587
+ elif retry_count >= max_retries:
588
+ status_text = f"Run failed after {attempts} attempts"
589
+
574
590
  updates = {
575
591
  "status.last_update": now_date().isoformat(),
576
- "status.state": "error",
592
+ "status.state": new_state,
577
593
  }
578
- update_in(resp, "status.state", "error")
594
+ update_in(resp, "status.state", new_state)
595
+ if status_text:
596
+ updates["status.status_text"] = status_text
597
+ update_in(resp, "status.status_text", status_text)
579
598
  if err:
580
599
  update_in(resp, "status.error", err_to_str(err))
581
600
  err = get_in(resp, "status.error")
@@ -584,9 +603,8 @@ class BaseRuntime(ModelObj):
584
603
 
585
604
  elif (
586
605
  not was_none
587
- and last_state != mlrun.common.runtimes.constants.RunStates.completed
588
- and last_state
589
- not in mlrun.common.runtimes.constants.RunStates.error_and_abortion_states()
606
+ and last_state != RunStates.completed
607
+ and last_state not in RunStates.error_and_abortion_states()
590
608
  ):
591
609
  try:
592
610
  runtime_cls = mlrun.runtimes.get_runtime_class(kind)
mlrun/runtimes/daskjob.py CHANGED
@@ -505,6 +505,7 @@ class DaskCluster(KubejobRuntime):
505
505
  state_thresholds: Optional[dict[str, int]] = None,
506
506
  reset_on_run: Optional[bool] = None,
507
507
  output_path: Optional[str] = "",
508
+ retry: Optional[Union[mlrun.model.Retry, dict]] = None,
508
509
  **launcher_kwargs,
509
510
  ) -> RunObject:
510
511
  if state_thresholds:
@@ -233,6 +233,7 @@ def run_mlrun_databricks_job(context,task_parameters: dict, **kwargs):
233
233
  state_thresholds: Optional[dict[str, int]] = None,
234
234
  reset_on_run: Optional[bool] = None,
235
235
  output_path: Optional[str] = "",
236
+ retry: Optional[Union[mlrun.model.Retry, dict]] = None,
236
237
  **launcher_kwargs,
237
238
  ) -> RunObject:
238
239
  if local:
mlrun/runtimes/local.py CHANGED
@@ -34,6 +34,7 @@ from nuclio import Event
34
34
 
35
35
  import mlrun
36
36
  import mlrun.common.constants as mlrun_constants
37
+ import mlrun.common.runtimes.constants
37
38
  from mlrun.lists import RunList
38
39
 
39
40
  from ..errors import err_to_str
@@ -315,15 +316,9 @@ class LocalRuntime(BaseRuntime, ParallelRunner):
315
316
  return context.to_dict()
316
317
 
317
318
  # if RunError was raised it means that the error was raised as part of running the function
318
- # ( meaning the state was already updated to error ) therefore we just re-raise the error
319
319
  except RunError as err:
320
320
  raise err
321
- # this exception handling is for the case where we fail on pre-loading or post-running the function
322
- # and the state was not updated to error yet, therefore we update the state to error and raise as RunError
323
321
  except Exception as exc:
324
- # set_state here is mainly for sanity, as we will raise RunError which is expected to be handled
325
- # by the caller and will set the state to error ( in `update_run_state` )
326
- context.set_state(error=err_to_str(exc), commit=True)
327
322
  logger.error(f"Run error, {traceback.format_exc()}")
328
323
  raise RunError(
329
324
  "Failed on pre-loading / post-running of the function"
mlrun/serving/server.py CHANGED
@@ -395,7 +395,6 @@ def add_monitoring_general_steps(
395
395
  monitor_flow_step = graph.add_step(
396
396
  "mlrun.serving.system_steps.BackgroundTaskStatus",
397
397
  "background_task_status_step",
398
- context=context,
399
398
  model_endpoint_creation_strategy=mlrun.common.schemas.ModelEndpointCreationStrategy.SKIP,
400
399
  )
401
400
  graph.add_step(
@@ -410,7 +409,6 @@ def add_monitoring_general_steps(
410
409
  "monitoring_pre_processor_step",
411
410
  after="filter_none",
412
411
  full_event=True,
413
- context=context,
414
412
  model_endpoint_creation_strategy=mlrun.common.schemas.ModelEndpointCreationStrategy.SKIP,
415
413
  )
416
414
  # flatten the events
@@ -790,6 +788,7 @@ class GraphContext:
790
788
  self.verbose = False
791
789
  self.stream = None
792
790
  self.root = None
791
+ self.executor: Optional[storey.flow.RunnableExecutor] = None
793
792
 
794
793
  if nuclio_context:
795
794
  self.logger: NuclioLogger = nuclio_context.logger