mlrun 1.10.0rc13__py3-none-any.whl → 1.10.0rc42__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of mlrun might be problematic.

Files changed (107)
  1. mlrun/__init__.py +22 -2
  2. mlrun/artifacts/base.py +0 -31
  3. mlrun/artifacts/document.py +6 -1
  4. mlrun/artifacts/llm_prompt.py +123 -25
  5. mlrun/artifacts/manager.py +0 -5
  6. mlrun/artifacts/model.py +3 -3
  7. mlrun/common/constants.py +10 -1
  8. mlrun/common/formatters/artifact.py +1 -0
  9. mlrun/common/model_monitoring/helpers.py +86 -0
  10. mlrun/common/schemas/__init__.py +3 -0
  11. mlrun/common/schemas/auth.py +2 -0
  12. mlrun/common/schemas/function.py +10 -0
  13. mlrun/common/schemas/hub.py +30 -18
  14. mlrun/common/schemas/model_monitoring/__init__.py +3 -0
  15. mlrun/common/schemas/model_monitoring/constants.py +30 -6
  16. mlrun/common/schemas/model_monitoring/functions.py +14 -5
  17. mlrun/common/schemas/model_monitoring/model_endpoints.py +21 -0
  18. mlrun/common/schemas/pipeline.py +1 -1
  19. mlrun/common/schemas/serving.py +3 -0
  20. mlrun/common/schemas/workflow.py +3 -1
  21. mlrun/common/secrets.py +22 -1
  22. mlrun/config.py +33 -11
  23. mlrun/datastore/__init__.py +11 -3
  24. mlrun/datastore/azure_blob.py +162 -47
  25. mlrun/datastore/datastore.py +9 -4
  26. mlrun/datastore/datastore_profile.py +61 -5
  27. mlrun/datastore/model_provider/huggingface_provider.py +363 -0
  28. mlrun/datastore/model_provider/mock_model_provider.py +87 -0
  29. mlrun/datastore/model_provider/model_provider.py +230 -65
  30. mlrun/datastore/model_provider/openai_provider.py +295 -42
  31. mlrun/datastore/s3.py +24 -2
  32. mlrun/datastore/storeytargets.py +2 -3
  33. mlrun/datastore/utils.py +15 -3
  34. mlrun/db/base.py +47 -19
  35. mlrun/db/httpdb.py +120 -56
  36. mlrun/db/nopdb.py +38 -10
  37. mlrun/execution.py +70 -19
  38. mlrun/hub/__init__.py +15 -0
  39. mlrun/hub/module.py +181 -0
  40. mlrun/k8s_utils.py +105 -16
  41. mlrun/launcher/base.py +13 -6
  42. mlrun/launcher/local.py +15 -0
  43. mlrun/model.py +24 -3
  44. mlrun/model_monitoring/__init__.py +1 -0
  45. mlrun/model_monitoring/api.py +66 -27
  46. mlrun/model_monitoring/applications/__init__.py +1 -1
  47. mlrun/model_monitoring/applications/base.py +509 -117
  48. mlrun/model_monitoring/applications/context.py +2 -4
  49. mlrun/model_monitoring/applications/results.py +4 -7
  50. mlrun/model_monitoring/controller.py +239 -101
  51. mlrun/model_monitoring/db/_schedules.py +116 -33
  52. mlrun/model_monitoring/db/_stats.py +4 -3
  53. mlrun/model_monitoring/db/tsdb/base.py +100 -9
  54. mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +11 -6
  55. mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +191 -50
  56. mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
  57. mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
  58. mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +259 -40
  59. mlrun/model_monitoring/helpers.py +54 -9
  60. mlrun/model_monitoring/stream_processing.py +45 -14
  61. mlrun/model_monitoring/writer.py +220 -1
  62. mlrun/platforms/__init__.py +3 -2
  63. mlrun/platforms/iguazio.py +7 -3
  64. mlrun/projects/operations.py +6 -1
  65. mlrun/projects/pipelines.py +46 -26
  66. mlrun/projects/project.py +166 -58
  67. mlrun/run.py +94 -17
  68. mlrun/runtimes/__init__.py +18 -0
  69. mlrun/runtimes/base.py +14 -6
  70. mlrun/runtimes/daskjob.py +7 -0
  71. mlrun/runtimes/local.py +5 -2
  72. mlrun/runtimes/mounts.py +20 -2
  73. mlrun/runtimes/mpijob/abstract.py +6 -0
  74. mlrun/runtimes/mpijob/v1.py +6 -0
  75. mlrun/runtimes/nuclio/__init__.py +1 -0
  76. mlrun/runtimes/nuclio/application/application.py +149 -17
  77. mlrun/runtimes/nuclio/function.py +76 -27
  78. mlrun/runtimes/nuclio/serving.py +97 -15
  79. mlrun/runtimes/pod.py +234 -21
  80. mlrun/runtimes/remotesparkjob.py +6 -0
  81. mlrun/runtimes/sparkjob/spark3job.py +6 -0
  82. mlrun/runtimes/utils.py +49 -11
  83. mlrun/secrets.py +54 -13
  84. mlrun/serving/__init__.py +2 -0
  85. mlrun/serving/remote.py +79 -6
  86. mlrun/serving/routers.py +23 -41
  87. mlrun/serving/server.py +320 -80
  88. mlrun/serving/states.py +725 -157
  89. mlrun/serving/steps.py +62 -0
  90. mlrun/serving/system_steps.py +200 -119
  91. mlrun/serving/v2_serving.py +9 -10
  92. mlrun/utils/helpers.py +288 -88
  93. mlrun/utils/logger.py +3 -1
  94. mlrun/utils/notifications/notification/base.py +18 -0
  95. mlrun/utils/notifications/notification/git.py +2 -4
  96. mlrun/utils/notifications/notification/slack.py +2 -4
  97. mlrun/utils/notifications/notification/webhook.py +2 -5
  98. mlrun/utils/notifications/notification_pusher.py +1 -1
  99. mlrun/utils/retryer.py +15 -2
  100. mlrun/utils/version/version.json +2 -2
  101. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/METADATA +45 -51
  102. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/RECORD +106 -101
  103. mlrun/api/schemas/__init__.py +0 -259
  104. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/WHEEL +0 -0
  105. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/entry_points.txt +0 -0
  106. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/licenses/LICENSE +0 -0
  107. {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/writer.py
@@ -13,9 +13,12 @@
 # limitations under the License.
 
 import json
+import typing
 from datetime import datetime, timezone
 from typing import Any, Callable, NewType, Optional
 
+import storey
+
 import mlrun.common.model_monitoring
 import mlrun.common.schemas
 import mlrun.common.schemas.alert as alert_objects
@@ -31,6 +34,8 @@ from mlrun.common.schemas.model_monitoring.constants import (
     WriterEvent,
     WriterEventKind,
 )
+from mlrun.config import config
+from mlrun.model_monitoring.db import TSDBConnector
 from mlrun.model_monitoring.db._stats import (
     ModelMonitoringCurrentStatsFile,
     ModelMonitoringDriftMeasuresFile,
@@ -73,7 +78,6 @@ class ModelMonitoringWriter(StepToDict):
         self._tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
             project=self.project, secret_provider=secret_provider
         )
-        self._endpoints_records = {}
 
     def _generate_event_on_drift(
         self,
@@ -226,3 +230,218 @@ class ModelMonitoringWriter(StepToDict):
         )
 
         logger.info("Model monitoring writer finished handling event")
+
+
+class WriterGraphFactory:
+    def __init__(
+        self,
+        parquet_path: str,
+    ):
+        self.parquet_path = parquet_path
+        self.parquet_batching_max_events = (
+            config.model_endpoint_monitoring.writer_graph.max_events
+        )
+        self.parquet_batching_timeout_secs = (
+            config.model_endpoint_monitoring.writer_graph.parquet_batching_timeout_secs
+        )
+
+    def apply_writer_graph(
+        self,
+        fn: mlrun.runtimes.ServingRuntime,
+        tsdb_connector: TSDBConnector,
+    ):
+        graph = typing.cast(
+            mlrun.serving.states.RootFlowStep,
+            fn.set_topology(mlrun.serving.states.StepKinds.flow, engine="async"),
+        )
+
+        graph.to("ReconstructWriterEvent", "event_reconstructor")
+        step = tsdb_connector.add_pre_writer_steps(
+            graph=graph, after="event_reconstructor"
+        )
+        before_choice = step.name if step else "event_reconstructor"
+        graph.add_step("KindChoice", "kind_choice_step", after=before_choice)
+        tsdb_connector.apply_writer_steps(
+            graph=graph,
+            after="kind_choice_step",
+        )
+        graph.add_step(
+            "AlertGenerator",
+            "alert_generator",
+            after="kind_choice_step",
+            project=fn.metadata.project,
+        )
+        graph.add_step(
+            "storey.Filter",
+            name="filter_none",
+            _fn="(event is not None)",
+            after="alert_generator",
+        )
+        graph.add_step(
+            "mlrun.serving.remote.MLRunAPIRemoteStep",
+            name="alert_generator_api_call",
+            after="filter_none",
+            method="POST",
+            path=f"projects/{fn.metadata.project}/events/{{kind}}",
+            fill_placeholders=True,
+        )
+
+        graph.add_step(
+            "mlrun.datastore.storeytargets.ParquetStoreyTarget",
+            alternative_v3io_access_key=mlrun.common.schemas.model_monitoring.ProjectSecretKeys.ACCESS_KEY,
+            name="stats_writer",
+            after="kind_choice_step",
+            graph_shape="cylinder",
+            path=self.parquet_path
+            if self.parquet_path.endswith("/")
+            else self.parquet_path + "/",
+            max_events=self.parquet_batching_max_events,
+            flush_after_seconds=self.parquet_batching_timeout_secs,
+            columns=[
+                StatsData.TIMESTAMP,
+                StatsData.STATS,
+                WriterEvent.ENDPOINT_ID,
+                StatsData.STATS_NAME,
+            ],
+            partition_cols=[WriterEvent.ENDPOINT_ID, StatsData.STATS_NAME],
+            single_file=True,
+        )
+
+
+class ReconstructWriterEvent(storey.MapClass):
+    def __init__(self):
+        super().__init__()
+
+    def do(self, event: dict) -> dict[str, Any]:
+        logger.info("Reconstructing the event", event=event)
+        kind = event.pop(WriterEvent.EVENT_KIND, WriterEventKind.RESULT)
+        result_event = _AppResultEvent(json.loads(event.pop(WriterEvent.DATA, "{}")))
+        result_event.update(_AppResultEvent(event))
+
+        expected_keys = list(
+            set(WriterEvent.list()).difference(
+                [WriterEvent.EVENT_KIND, WriterEvent.DATA]
+            )
+        )
+        if kind == WriterEventKind.METRIC:
+            expected_keys.extend(MetricData.list())
+        elif kind == WriterEventKind.RESULT:
+            expected_keys.extend(ResultData.list())
+        elif kind == WriterEventKind.STATS:
+            expected_keys.extend(StatsData.list())
+        else:
+            raise _WriterEventValueError(
+                f"Unknown event kind: {kind}, expected one of: {WriterEventKind.list()}"
+            )
+        missing_keys = [key for key in expected_keys if key not in result_event]
+        if missing_keys:
+            raise _WriterEventValueError(
+                f"The received event misses some keys compared to the expected "
+                f"monitoring application event schema: {missing_keys} for event kind {kind}"
+            )
+        result_event["kind"] = kind
+        if kind in WriterEventKind.user_app_outputs():
+            result_event[WriterEvent.END_INFER_TIME] = datetime.fromisoformat(
+                event[WriterEvent.END_INFER_TIME]
+            )
+        if kind == WriterEventKind.STATS:
+            result_event[StatsData.STATS] = json.dumps(result_event[StatsData.STATS])
+        return result_event
+
+
+class KindChoice(storey.Choice):
+    def select_outlets(self, event):
+        kind = event.get("kind")
+        logger.info("Selecting the outlet for the event", kind=kind)
+        if kind == WriterEventKind.METRIC:
+            outlets = ["tsdb_metrics"]
+        elif kind == WriterEventKind.RESULT:
+            outlets = ["tsdb_app_results", "alert_generator"]
+        elif kind == WriterEventKind.STATS:
+            outlets = ["stats_writer"]
+        else:
+            raise _WriterEventValueError(
+                f"Unknown event kind: {kind}, expected one of: {WriterEventKind.list()}"
+            )
+        return outlets
+
+
+class AlertGenerator(storey.MapClass):
+    def __init__(self, project: str, **kwargs):
+        self.project = project
+        super().__init__(**kwargs)
+
+    def do(self, event: dict) -> Optional[dict[str, Any]]:
+        kind = event.pop(WriterEvent.EVENT_KIND, WriterEventKind.RESULT)
+        if (
+            mlrun.mlconf.alerts.mode == mlrun.common.schemas.alert.AlertsModes.enabled
+            and kind == WriterEventKind.RESULT
+            and (
+                event[ResultData.RESULT_STATUS] == ResultStatusApp.detected.value
+                or event[ResultData.RESULT_STATUS]
+                == ResultStatusApp.potential_detection.value
+            )
+        ):
+            event_value = {
+                "app_name": event[WriterEvent.APPLICATION_NAME],
+                "model": event[WriterEvent.ENDPOINT_NAME],
+                "model_endpoint_id": event[WriterEvent.ENDPOINT_ID],
+                "result_name": event[ResultData.RESULT_NAME],
+                "result_value": event[ResultData.RESULT_VALUE],
+            }
+            data = self._generate_event_data(
+                entity_id=get_result_instance_fqn(
+                    event[WriterEvent.ENDPOINT_ID],
+                    event[WriterEvent.APPLICATION_NAME],
+                    event[ResultData.RESULT_NAME],
+                ),
+                result_status=event[ResultData.RESULT_STATUS],
+                event_value=event_value,
+                project_name=self.project,
+                result_kind=event[ResultData.RESULT_KIND],
+            )
+            event = data.dict()
+            logger.info("Generated alert event", event=event)
+            return event
+        return None
+
+    @staticmethod
+    def _generate_alert_event_kind(
+        result_kind: int, result_status: int
+    ) -> alert_objects.EventKind:
+        """Generate the required Event Kind format for the alerting system"""
+        event_kind = ResultKindApp(value=result_kind).name
+
+        if result_status == ResultStatusApp.detected.value:
+            event_kind = f"{event_kind}_detected"
+        else:
+            event_kind = f"{event_kind}_suspected"
+        return alert_objects.EventKind(
+            value=mlrun.utils.helpers.normalize_name(event_kind)
+        )
+
+    def _generate_event_data(
+        self,
+        entity_id: str,
+        result_status: int,
+        event_value: dict,
+        project_name: str,
+        result_kind: int,
+    ) -> mlrun.common.schemas.Event:
+        entity = mlrun.common.schemas.alert.EventEntities(
+            kind=alert_objects.EventEntityKind.MODEL_ENDPOINT_RESULT,
+            project=project_name,
+            ids=[entity_id],
+        )
+
+        event_kind = self._generate_alert_event_kind(
+            result_status=result_status, result_kind=result_kind
+        )
+
+        event_data = mlrun.common.schemas.Event(
+            kind=alert_objects.EventKind(value=event_kind),
+            entity=entity,
+            value_dict=event_value,
+        )
+
+        return event_data
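
The new ReconstructWriterEvent step merges the serialized `data` payload back into the flat stream event before KindChoice routes it to the TSDB, alert, or stats branches. A minimal standalone sketch of that reconstruction, using illustrative field names rather than the real WriterEvent constants:

    import json
    from datetime import datetime

    # Hypothetical writer event as it arrives on the stream; "data" carries the
    # kind-specific fields as a JSON string (all field names here are assumptions).
    raw_event = {
        "event_kind": "result",
        "endpoint_id": "ep-123",
        "application_name": "drift-app",
        "end_infer_time": "2025-01-01T00:00:00+00:00",
        "data": json.dumps(
            {"result_name": "drift", "result_value": 0.42, "result_status": 2}
        ),
    }

    # Mimic ReconstructWriterEvent.do(): fold the payload into the event, tag it
    # with its kind, and parse the inference timestamp.
    kind = raw_event.pop("event_kind")
    event = json.loads(raw_event.pop("data", "{}"))
    event.update(raw_event)
    event["kind"] = kind
    event["end_infer_time"] = datetime.fromisoformat(event["end_infer_time"])

    # KindChoice would route this "result" event to ["tsdb_app_results", "alert_generator"].
    print(event["result_name"], event["result_value"])  # drift 0.42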
mlrun/platforms/__init__.py
@@ -25,6 +25,7 @@ from .iguazio import (
 )
 
 
+# TODO: Remove in 1.11.0
 class _DeprecationHelper:
     """A helper class to deprecate old schemas"""
 
@@ -48,12 +49,12 @@ class _DeprecationHelper:
     def _warn(self):
         warnings.warn(
             f"mlrun.platforms.{self._new_target} is deprecated since version {self._version}, "
-            f"and will be removed in 1.10. Use mlrun.runtimes.mounts.{self._new_target} instead.",
+            f"and will be removed in 1.11.0. Use mlrun.runtimes.mounts.{self._new_target} instead.",
             FutureWarning,
         )
 
 
-# TODO: Remove in 1.10
+# TODO: Remove in 1.11.0
 # For backwards compatibility
 VolumeMount = _DeprecationHelper("VolumeMount")
 auto_mount = _DeprecationHelper("auto_mount")
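
For reference, a standalone sketch of the alias pattern that _DeprecationHelper implements (simplified; the constructor default below is an assumption, and the real class lives in mlrun.platforms):

    import warnings

    class DeprecatedAlias:
        # Placeholder that warns on use and forwards to mlrun.runtimes.mounts.
        def __init__(self, new_target: str, version: str = "1.8.0"):  # assumed default version
            self._new_target = new_target
            self._version = version

        def __call__(self, *args, **kwargs):
            warnings.warn(
                f"mlrun.platforms.{self._new_target} is deprecated since version "
                f"{self._version}, and will be removed in 1.11.0. "
                f"Use mlrun.runtimes.mounts.{self._new_target} instead.",
                FutureWarning,
            )
            from mlrun.runtimes import mounts  # delegate to the real implementation
            return getattr(mounts, self._new_target)(*args, **kwargs)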
mlrun/platforms/iguazio.py
@@ -96,7 +96,11 @@ class OutputStream:
         if access_key:
             v3io_client_kwargs["access_key"] = access_key
 
-        self._v3io_client = v3io.dataplane.Client(**v3io_client_kwargs)
+        if not mock:
+            self._v3io_client = v3io.dataplane.Client(**v3io_client_kwargs)
+        else:
+            self._v3io_client = None
+
         self._container, self._stream_path = split_path(stream_path)
         self._shards = shards
         self._retention_in_hours = retention_in_hours
@@ -105,7 +109,7 @@ class OutputStream:
         self._mock = mock
         self._mock_queue = []
 
-    def create_stream(self):
+    def create_stream(self) -> None:
         # this import creates an import loop via the utils module, so putting it in execution path
         from mlrun.utils.helpers import logger
 
@@ -210,7 +214,7 @@ class KafkaOutputStream:
         self._initialized = False
 
     def _lazy_init(self):
-        if self._initialized:
+        if self._initialized or self._mock:
             return
 
         import kafka
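
Both stream classes now respect mock mode end to end: OutputStream skips building the v3io dataplane client, and KafkaOutputStream skips its lazy producer initialization. A hedged usage sketch; the constructor and push signatures below are assumptions based on the diff:

    from mlrun.platforms.iguazio import OutputStream

    # With mock=True no v3io client is created (_v3io_client stays None); pushed
    # records accumulate in an in-memory queue instead of hitting a real stream.
    stream = OutputStream("/some-container/some/stream/path", mock=True)
    stream.push({"hello": "world"})  # appended to stream._mock_queue, no network I/O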
mlrun/projects/operations.py
@@ -177,7 +177,12 @@ def run_function(
                             This ensures latest code changes are executed. This argument must be used in
                             conjunction with the local=True argument.
     :param output_path:     path to store artifacts, when running in a workflow this will be set automatically
-    :param retry:           Retry configuration for the run, can be a dict or an instance of mlrun.model.Retry.
+    :param retry:           Retry configuration for the run, can be a dict or an instance of
+                            :py:class:`~mlrun.model.Retry`.
+                            The `count` field in the `Retry` object specifies the number of retry attempts.
+                            If `count=0`, the run will not be retried.
+                            The `backoff` field specifies the retry backoff strategy between retry attempts.
+                            If not provided, the default backoff delay is 30 seconds.
     :return: MLRun RunObject or PipelineNodeWrapper
     """
     if artifact_path:
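
A hedged example of the documented retry parameter; the project and function names are placeholders:

    import mlrun
    from mlrun.model import Retry

    project = mlrun.get_or_create_project("demo", context="./")
    project.set_function("trainer.py", name="trainer", kind="job", image="mlrun/mlrun")

    # Retry the run up to 3 times; with no backoff given, the documented default
    # delay of 30 seconds applies between attempts. A plain dict such as
    # retry={"count": 3} is accepted as well, per the docstring above.
    run = mlrun.run_function("trainer", retry=Retry(count=3))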
mlrun/projects/pipelines.py
@@ -228,11 +228,11 @@ class _PipelineContext:
         force_run_local = mlrun.mlconf.force_run_local
         if force_run_local is None or force_run_local == "auto":
             force_run_local = not mlrun.mlconf.is_api_running_on_k8s()
+
+        if self.workflow:
             if not mlrun.mlconf.kfp_url:
                 logger.debug("Kubeflow pipeline URL is not set, running locally")
                 force_run_local = True
-
-        if self.workflow:
             force_run_local = force_run_local or self.workflow.run_local
 
         return force_run_local
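
This reorder scopes the missing-kfp_url fallback to workflow runs only; the remaining mlrun/projects/pipelines.py hunks (the rerun_workflow changes) follow below. A plain-function sketch of the resulting decision logic, not the mlrun API:

    def resolve_force_run_local(
        force_run_local, api_on_k8s: bool, kfp_url: str, in_workflow: bool, workflow_run_local: bool
    ) -> bool:
        # None/"auto" -> run locally unless the API is running on Kubernetes
        if force_run_local is None or force_run_local == "auto":
            force_run_local = not api_on_k8s
        if in_workflow:
            if not kfp_url:
                force_run_local = True  # no Kubeflow URL -> the workflow must run locally
            force_run_local = force_run_local or workflow_run_local
        return force_run_local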
@@ -1072,7 +1072,11 @@ def github_webhook(request):
 
 
 def rerun_workflow(
-    context: mlrun.execution.MLClientCtx, run_uid: str, project_name: str
+    context: mlrun.execution.MLClientCtx,
+    run_uid: str,
+    project_name: str,
+    original_runner_uid: str,
+    original_workflow_name: str,
 ):
     """
     Re-run a workflow by retrying a previously failed KFP pipeline.
@@ -1080,8 +1084,11 @@ def rerun_workflow(
     :param context: MLRun context.
     :param run_uid: The run UID of the original workflow to retry.
     :param project_name: The project name.
+    :param original_runner_uid: The original workflow runner UID.
+    :param original_workflow_name: The original workflow name.
     """
     db = mlrun.get_run_db()
+    new_pipeline_id = None
 
     try:
         # Invoke the KFP retry endpoint (direct-submit mode)
@@ -1096,6 +1103,24 @@
             rerun_of_workflow=run_uid,
         )
 
+        # Enqueue "running" notifications server-side for this RerunRunner run
+        db.push_run_notifications(context.uid, project_name)
+
+        context.set_label(
+            mlrun_constants.MLRunInternalLabels.workflow_id, new_pipeline_id
+        )
+        context.update_run()
+
+        context.log_result("workflow_id", new_pipeline_id)
+
+        pipeline = wait_for_pipeline_completion(
+            new_pipeline_id,
+            project=project_name,
+        )
+
+        final_state = pipeline["run"]["status"]
+        context.log_result("workflow_state", final_state, commit=True)
+
     except mlrun.errors.MLRunHTTPError as http_exc:
         logger.error(
             "Failed calling KFP retry API",
@@ -1104,33 +1129,28 @@
         )
         raise
 
-    # Enqueue "running" notifications server-side for this RerunRunner run
-    db.push_run_notifications(context.uid, project_name)
-
-    context.set_label(mlrun_constants.MLRunInternalLabels.workflow_id, new_pipeline_id)
-    context.update_run()
-
-    context.log_result("workflow_id", new_pipeline_id)
-
-    try:
-        pipeline = wait_for_pipeline_completion(
-            new_pipeline_id,
-            project=project_name,
-        )
     except Exception as exc:
-        mlrun.utils.logger.error(
-            "Failed waiting for workflow completion",
+        logger.error(
+            "Error during rerun_workflow execution",
+            error=err_to_str(exc),
             rerun_pipeline_id=new_pipeline_id,
-            exc=err_to_str(exc),
         )
-    else:
-        final_state = pipeline["run"]["status"]
-        context.log_result("workflow_state", final_state, commit=True)
+        raise
 
-    if final_state != mlrun_pipelines.common.models.RunStatuses.succeeded:
-        raise mlrun.errors.MLRunRuntimeError(
-            f"Pipeline retry of {run_uid} finished in state={final_state}"
-        )
+    finally:
+        # Once the rerun has finished, clear the "retrying" label on the original runner
+        # so that subsequent retry requests can acquire the lock again.
+        db.set_run_retrying_status(
+            project=project_name,
+            name=original_workflow_name,
+            run_id=original_runner_uid,
+            retrying=False,
+        )
+
+    if final_state != mlrun_pipelines.common.models.RunStatuses.succeeded:
+        raise mlrun.errors.MLRunRuntimeError(
+            f"Pipeline retry of {run_uid} finished in state={final_state}"
+        )
 
 
 def load_and_run(context, *args, **kwargs):
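
With this restructuring, notification pushing, run labeling, and waiting for completion all move inside the try block, and the finally block guarantees the original runner's "retrying" lock is released even when the rerun fails. A hypothetical invocation of the extended signature, with placeholder identifiers and `context` assumed to be the MLClientCtx injected into the runner job:

    rerun_workflow(
        context=context,  # injected mlrun.execution.MLClientCtx
        run_uid="uid-of-the-failed-pipeline",
        project_name="my-project",
        original_runner_uid="uid-of-the-original-runner",
        original_workflow_name="main",
    )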