mlrun 1.10.0rc13__py3-none-any.whl → 1.10.0rc42__py3-none-any.whl
This diff compares publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their public registries.
Potentially problematic release.
This version of mlrun might be problematic.
- mlrun/__init__.py +22 -2
- mlrun/artifacts/base.py +0 -31
- mlrun/artifacts/document.py +6 -1
- mlrun/artifacts/llm_prompt.py +123 -25
- mlrun/artifacts/manager.py +0 -5
- mlrun/artifacts/model.py +3 -3
- mlrun/common/constants.py +10 -1
- mlrun/common/formatters/artifact.py +1 -0
- mlrun/common/model_monitoring/helpers.py +86 -0
- mlrun/common/schemas/__init__.py +3 -0
- mlrun/common/schemas/auth.py +2 -0
- mlrun/common/schemas/function.py +10 -0
- mlrun/common/schemas/hub.py +30 -18
- mlrun/common/schemas/model_monitoring/__init__.py +3 -0
- mlrun/common/schemas/model_monitoring/constants.py +30 -6
- mlrun/common/schemas/model_monitoring/functions.py +14 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +21 -0
- mlrun/common/schemas/pipeline.py +1 -1
- mlrun/common/schemas/serving.py +3 -0
- mlrun/common/schemas/workflow.py +3 -1
- mlrun/common/secrets.py +22 -1
- mlrun/config.py +33 -11
- mlrun/datastore/__init__.py +11 -3
- mlrun/datastore/azure_blob.py +162 -47
- mlrun/datastore/datastore.py +9 -4
- mlrun/datastore/datastore_profile.py +61 -5
- mlrun/datastore/model_provider/huggingface_provider.py +363 -0
- mlrun/datastore/model_provider/mock_model_provider.py +87 -0
- mlrun/datastore/model_provider/model_provider.py +230 -65
- mlrun/datastore/model_provider/openai_provider.py +295 -42
- mlrun/datastore/s3.py +24 -2
- mlrun/datastore/storeytargets.py +2 -3
- mlrun/datastore/utils.py +15 -3
- mlrun/db/base.py +47 -19
- mlrun/db/httpdb.py +120 -56
- mlrun/db/nopdb.py +38 -10
- mlrun/execution.py +70 -19
- mlrun/hub/__init__.py +15 -0
- mlrun/hub/module.py +181 -0
- mlrun/k8s_utils.py +105 -16
- mlrun/launcher/base.py +13 -6
- mlrun/launcher/local.py +15 -0
- mlrun/model.py +24 -3
- mlrun/model_monitoring/__init__.py +1 -0
- mlrun/model_monitoring/api.py +66 -27
- mlrun/model_monitoring/applications/__init__.py +1 -1
- mlrun/model_monitoring/applications/base.py +509 -117
- mlrun/model_monitoring/applications/context.py +2 -4
- mlrun/model_monitoring/applications/results.py +4 -7
- mlrun/model_monitoring/controller.py +239 -101
- mlrun/model_monitoring/db/_schedules.py +116 -33
- mlrun/model_monitoring/db/_stats.py +4 -3
- mlrun/model_monitoring/db/tsdb/base.py +100 -9
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +11 -6
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +191 -50
- mlrun/model_monitoring/db/tsdb/tdengine/writer_graph_steps.py +51 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +17 -4
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +259 -40
- mlrun/model_monitoring/helpers.py +54 -9
- mlrun/model_monitoring/stream_processing.py +45 -14
- mlrun/model_monitoring/writer.py +220 -1
- mlrun/platforms/__init__.py +3 -2
- mlrun/platforms/iguazio.py +7 -3
- mlrun/projects/operations.py +6 -1
- mlrun/projects/pipelines.py +46 -26
- mlrun/projects/project.py +166 -58
- mlrun/run.py +94 -17
- mlrun/runtimes/__init__.py +18 -0
- mlrun/runtimes/base.py +14 -6
- mlrun/runtimes/daskjob.py +7 -0
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mounts.py +20 -2
- mlrun/runtimes/mpijob/abstract.py +6 -0
- mlrun/runtimes/mpijob/v1.py +6 -0
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/application/application.py +149 -17
- mlrun/runtimes/nuclio/function.py +76 -27
- mlrun/runtimes/nuclio/serving.py +97 -15
- mlrun/runtimes/pod.py +234 -21
- mlrun/runtimes/remotesparkjob.py +6 -0
- mlrun/runtimes/sparkjob/spark3job.py +6 -0
- mlrun/runtimes/utils.py +49 -11
- mlrun/secrets.py +54 -13
- mlrun/serving/__init__.py +2 -0
- mlrun/serving/remote.py +79 -6
- mlrun/serving/routers.py +23 -41
- mlrun/serving/server.py +320 -80
- mlrun/serving/states.py +725 -157
- mlrun/serving/steps.py +62 -0
- mlrun/serving/system_steps.py +200 -119
- mlrun/serving/v2_serving.py +9 -10
- mlrun/utils/helpers.py +288 -88
- mlrun/utils/logger.py +3 -1
- mlrun/utils/notifications/notification/base.py +18 -0
- mlrun/utils/notifications/notification/git.py +2 -4
- mlrun/utils/notifications/notification/slack.py +2 -4
- mlrun/utils/notifications/notification/webhook.py +2 -5
- mlrun/utils/notifications/notification_pusher.py +1 -1
- mlrun/utils/retryer.py +15 -2
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/METADATA +45 -51
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/RECORD +106 -101
- mlrun/api/schemas/__init__.py +0 -259
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc13.dist-info → mlrun-1.10.0rc42.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/writer.py
CHANGED
@@ -13,9 +13,12 @@
 # limitations under the License.

 import json
+import typing
 from datetime import datetime, timezone
 from typing import Any, Callable, NewType, Optional

+import storey
+
 import mlrun.common.model_monitoring
 import mlrun.common.schemas
 import mlrun.common.schemas.alert as alert_objects
@@ -31,6 +34,8 @@ from mlrun.common.schemas.model_monitoring.constants import (
     WriterEvent,
     WriterEventKind,
 )
+from mlrun.config import config
+from mlrun.model_monitoring.db import TSDBConnector
 from mlrun.model_monitoring.db._stats import (
     ModelMonitoringCurrentStatsFile,
     ModelMonitoringDriftMeasuresFile,
@@ -73,7 +78,6 @@ class ModelMonitoringWriter(StepToDict):
         self._tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
             project=self.project, secret_provider=secret_provider
         )
-        self._endpoints_records = {}

     def _generate_event_on_drift(
         self,
@@ -226,3 +230,218 @@ class ModelMonitoringWriter(StepToDict):
         )

         logger.info("Model monitoring writer finished handling event")
+
+
+class WriterGraphFactory:
+    def __init__(
+        self,
+        parquet_path: str,
+    ):
+        self.parquet_path = parquet_path
+        self.parquet_batching_max_events = (
+            config.model_endpoint_monitoring.writer_graph.max_events
+        )
+        self.parquet_batching_timeout_secs = (
+            config.model_endpoint_monitoring.writer_graph.parquet_batching_timeout_secs
+        )
+
+    def apply_writer_graph(
+        self,
+        fn: mlrun.runtimes.ServingRuntime,
+        tsdb_connector: TSDBConnector,
+    ):
+        graph = typing.cast(
+            mlrun.serving.states.RootFlowStep,
+            fn.set_topology(mlrun.serving.states.StepKinds.flow, engine="async"),
+        )
+
+        graph.to("ReconstructWriterEvent", "event_reconstructor")
+        step = tsdb_connector.add_pre_writer_steps(
+            graph=graph, after="event_reconstructor"
+        )
+        before_choice = step.name if step else "event_reconstructor"
+        graph.add_step("KindChoice", "kind_choice_step", after=before_choice)
+        tsdb_connector.apply_writer_steps(
+            graph=graph,
+            after="kind_choice_step",
+        )
+        graph.add_step(
+            "AlertGenerator",
+            "alert_generator",
+            after="kind_choice_step",
+            project=fn.metadata.project,
+        )
+        graph.add_step(
+            "storey.Filter",
+            name="filter_none",
+            _fn="(event is not None)",
+            after="alert_generator",
+        )
+        graph.add_step(
+            "mlrun.serving.remote.MLRunAPIRemoteStep",
+            name="alert_generator_api_call",
+            after="filter_none",
+            method="POST",
+            path=f"projects/{fn.metadata.project}/events/{{kind}}",
+            fill_placeholders=True,
+        )
+
+        graph.add_step(
+            "mlrun.datastore.storeytargets.ParquetStoreyTarget",
+            alternative_v3io_access_key=mlrun.common.schemas.model_monitoring.ProjectSecretKeys.ACCESS_KEY,
+            name="stats_writer",
+            after="kind_choice_step",
+            graph_shape="cylinder",
+            path=self.parquet_path
+            if self.parquet_path.endswith("/")
+            else self.parquet_path + "/",
+            max_events=self.parquet_batching_max_events,
+            flush_after_seconds=self.parquet_batching_timeout_secs,
+            columns=[
+                StatsData.TIMESTAMP,
+                StatsData.STATS,
+                WriterEvent.ENDPOINT_ID,
+                StatsData.STATS_NAME,
+            ],
+            partition_cols=[WriterEvent.ENDPOINT_ID, StatsData.STATS_NAME],
+            single_file=True,
+        )
+
+
+class ReconstructWriterEvent(storey.MapClass):
+    def __init__(self):
+        super().__init__()
+
+    def do(self, event: dict) -> dict[str, Any]:
+        logger.info("Reconstructing the event", event=event)
+        kind = event.pop(WriterEvent.EVENT_KIND, WriterEventKind.RESULT)
+        result_event = _AppResultEvent(json.loads(event.pop(WriterEvent.DATA, "{}")))
+        result_event.update(_AppResultEvent(event))
+
+        expected_keys = list(
+            set(WriterEvent.list()).difference(
+                [WriterEvent.EVENT_KIND, WriterEvent.DATA]
+            )
+        )
+        if kind == WriterEventKind.METRIC:
+            expected_keys.extend(MetricData.list())
+        elif kind == WriterEventKind.RESULT:
+            expected_keys.extend(ResultData.list())
+        elif kind == WriterEventKind.STATS:
+            expected_keys.extend(StatsData.list())
+        else:
+            raise _WriterEventValueError(
+                f"Unknown event kind: {kind}, expected one of: {WriterEventKind.list()}"
+            )
+        missing_keys = [key for key in expected_keys if key not in result_event]
+        if missing_keys:
+            raise _WriterEventValueError(
+                f"The received event misses some keys compared to the expected "
+                f"monitoring application event schema: {missing_keys} for event kind {kind}"
+            )
+        result_event["kind"] = kind
+        if kind in WriterEventKind.user_app_outputs():
+            result_event[WriterEvent.END_INFER_TIME] = datetime.fromisoformat(
+                event[WriterEvent.END_INFER_TIME]
+            )
+        if kind == WriterEventKind.STATS:
+            result_event[StatsData.STATS] = json.dumps(result_event[StatsData.STATS])
+        return result_event
+
+
+class KindChoice(storey.Choice):
+    def select_outlets(self, event):
+        kind = event.get("kind")
+        logger.info("Selecting the outlet for the event", kind=kind)
+        if kind == WriterEventKind.METRIC:
+            outlets = ["tsdb_metrics"]
+        elif kind == WriterEventKind.RESULT:
+            outlets = ["tsdb_app_results", "alert_generator"]
+        elif kind == WriterEventKind.STATS:
+            outlets = ["stats_writer"]
+        else:
+            raise _WriterEventValueError(
+                f"Unknown event kind: {kind}, expected one of: {WriterEventKind.list()}"
+            )
+        return outlets
+
+
+class AlertGenerator(storey.MapClass):
+    def __init__(self, project: str, **kwargs):
+        self.project = project
+        super().__init__(**kwargs)
+
+    def do(self, event: dict) -> Optional[dict[str, Any]]:
+        kind = event.pop(WriterEvent.EVENT_KIND, WriterEventKind.RESULT)
+        if (
+            mlrun.mlconf.alerts.mode == mlrun.common.schemas.alert.AlertsModes.enabled
+            and kind == WriterEventKind.RESULT
+            and (
+                event[ResultData.RESULT_STATUS] == ResultStatusApp.detected.value
+                or event[ResultData.RESULT_STATUS]
+                == ResultStatusApp.potential_detection.value
+            )
+        ):
+            event_value = {
+                "app_name": event[WriterEvent.APPLICATION_NAME],
+                "model": event[WriterEvent.ENDPOINT_NAME],
+                "model_endpoint_id": event[WriterEvent.ENDPOINT_ID],
+                "result_name": event[ResultData.RESULT_NAME],
+                "result_value": event[ResultData.RESULT_VALUE],
+            }
+            data = self._generate_event_data(
+                entity_id=get_result_instance_fqn(
+                    event[WriterEvent.ENDPOINT_ID],
+                    event[WriterEvent.APPLICATION_NAME],
+                    event[ResultData.RESULT_NAME],
+                ),
+                result_status=event[ResultData.RESULT_STATUS],
+                event_value=event_value,
+                project_name=self.project,
+                result_kind=event[ResultData.RESULT_KIND],
+            )
+            event = data.dict()
+            logger.info("Generated alert event", event=event)
+            return event
+        return None
+
+    @staticmethod
+    def _generate_alert_event_kind(
+        result_kind: int, result_status: int
+    ) -> alert_objects.EventKind:
+        """Generate the required Event Kind format for the alerting system"""
+        event_kind = ResultKindApp(value=result_kind).name
+
+        if result_status == ResultStatusApp.detected.value:
+            event_kind = f"{event_kind}_detected"
+        else:
+            event_kind = f"{event_kind}_suspected"
+        return alert_objects.EventKind(
+            value=mlrun.utils.helpers.normalize_name(event_kind)
+        )
+
+    def _generate_event_data(
+        self,
+        entity_id: str,
+        result_status: int,
+        event_value: dict,
+        project_name: str,
+        result_kind: int,
+    ) -> mlrun.common.schemas.Event:
+        entity = mlrun.common.schemas.alert.EventEntities(
+            kind=alert_objects.EventEntityKind.MODEL_ENDPOINT_RESULT,
+            project=project_name,
+            ids=[entity_id],
+        )
+
+        event_kind = self._generate_alert_event_kind(
+            result_status=result_status, result_kind=result_kind
+        )
+
+        event_data = mlrun.common.schemas.Event(
+            kind=alert_objects.EventKind(value=event_kind),
+            entity=entity,
+            value_dict=event_value,
+        )
+
+        return event_data
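The new classes above split the writer into a storey graph: ReconstructWriterEvent normalizes raw events, KindChoice fans them out by kind, and AlertGenerator turns detected results into alert events. Below is a minimal self-contained sketch of the routing contract that KindChoice.select_outlets implements (plain string kinds stand in for the WriterEventKind enum members; this is an illustration, not the mlrun implementation):

# Sketch of the kind-based fan-out used by the writer graph. The outlet
# names mirror the steps wired in apply_writer_graph above; the string
# kinds are placeholders for WriterEventKind members.
ROUTING: dict[str, list[str]] = {
    "metric": ["tsdb_metrics"],                         # metrics go straight to the TSDB
    "result": ["tsdb_app_results", "alert_generator"],  # results also feed alerting
    "stats": ["stats_writer"],                          # stats land in the parquet target
}

def select_outlets(event: dict) -> list[str]:
    kind = event.get("kind")
    if kind not in ROUTING:
        raise ValueError(f"Unknown event kind: {kind!r}, expected one of: {list(ROUTING)}")
    return ROUTING[kind]

# A drift result fans out to both the TSDB and the alerting branch:
assert select_outlets({"kind": "result"}) == ["tsdb_app_results", "alert_generator"]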
mlrun/platforms/__init__.py
CHANGED
@@ -25,6 +25,7 @@ from .iguazio import (
 )


+# TODO: Remove in 1.11.0
 class _DeprecationHelper:
     """A helper class to deprecate old schemas"""

@@ -48,12 +49,12 @@ class _DeprecationHelper:
     def _warn(self):
         warnings.warn(
             f"mlrun.platforms.{self._new_target} is deprecated since version {self._version}, "
-            f"and will be removed in 1.
+            f"and will be removed in 1.11.0. Use mlrun.runtimes.mounts.{self._new_target} instead.",
             FutureWarning,
         )


-# TODO: Remove in 1.
+# TODO: Remove in 1.11.0
 # For backwards compatibility
 VolumeMount = _DeprecationHelper("VolumeMount")
 auto_mount = _DeprecationHelper("auto_mount")
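For context, the deprecation shim warns on use and forwards to the symbol's new home under mlrun.runtimes.mounts. A rough self-contained sketch of that pattern (DeprecatedAlias is illustrative, not mlrun's actual _DeprecationHelper internals; the default version string is an assumption):

import warnings

class DeprecatedAlias:
    """Illustrative stand-in for a lazy deprecation shim: warn on use,
    then forward to the symbol's relocated implementation."""

    def __init__(self, target: str, version: str = "1.10.0"):
        self._target = target
        self._version = version

    def __call__(self, *args, **kwargs):
        warnings.warn(
            f"mlrun.platforms.{self._target} is deprecated since version "
            f"{self._version}, and will be removed in 1.11.0. "
            f"Use mlrun.runtimes.mounts.{self._target} instead.",
            FutureWarning,
        )
        import mlrun.runtimes.mounts  # resolved lazily to avoid import cycles
        return getattr(mlrun.runtimes.mounts, self._target)(*args, **kwargs)

# Calling the alias emits a FutureWarning and delegates to the new location:
auto_mount = DeprecatedAlias("auto_mount")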
mlrun/platforms/iguazio.py
CHANGED
@@ -96,7 +96,11 @@ class OutputStream:
         if access_key:
             v3io_client_kwargs["access_key"] = access_key

-
+        if not mock:
+            self._v3io_client = v3io.dataplane.Client(**v3io_client_kwargs)
+        else:
+            self._v3io_client = None
+
         self._container, self._stream_path = split_path(stream_path)
         self._shards = shards
         self._retention_in_hours = retention_in_hours
@@ -105,7 +109,7 @@ class OutputStream:
         self._mock = mock
         self._mock_queue = []

-    def create_stream(self):
+    def create_stream(self) -> None:
         # this import creates an import loop via the utils module, so putting it in execution path
         from mlrun.utils.helpers import logger

@@ -210,7 +214,7 @@ class KafkaOutputStream:
         self._initialized = False

     def _lazy_init(self):
-        if self._initialized:
+        if self._initialized or self._mock:
             return

         import kafka
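Both stream classes now short-circuit when mocking: OutputStream skips building a v3io client, and KafkaOutputStream never lazily initializes a Kafka producer. A small self-contained sketch of the same guard pattern (the class and names are illustrative, not mlrun's; kafka-python is used for the real-client branch):

class LazyKafkaSink:
    """Illustrative mock guard: skip expensive client setup when mocking."""

    def __init__(self, brokers: list[str], mock: bool = False):
        self._brokers = brokers
        self._mock = mock
        self._initialized = False
        self._producer = None
        self._mock_queue: list[bytes] = []

    def _lazy_init(self) -> None:
        # Mirrors the diff: return early when already initialized OR mocking,
        # so tests never open a real Kafka connection.
        if self._initialized or self._mock:
            return
        import kafka  # real client is only built on first genuine use

        self._producer = kafka.KafkaProducer(bootstrap_servers=self._brokers)
        self._initialized = True

    def push(self, record: bytes) -> None:
        self._lazy_init()
        if self._mock:
            self._mock_queue.append(record)  # collected in-memory instead of sent
        else:
            self._producer.send("events", record)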
mlrun/projects/operations.py
CHANGED
@@ -177,7 +177,12 @@ def run_function(
                              This ensures latest code changes are executed. This argument must be used in
                              conjunction with the local=True argument.
     :param output_path: path to store artifacts, when running in a workflow this will be set automatically
-    :param retry: Retry configuration for the run, can be a dict or an instance of
+    :param retry: Retry configuration for the run, can be a dict or an instance of
+                  :py:class:`~mlrun.model.Retry`.
+                  The `count` field in the `Retry` object specifies the number of retry attempts.
+                  If `count=0`, the run will not be retried.
+                  The `backoff` field specifies the retry backoff strategy between retry attempts.
+                  If not provided, the default backoff delay is 30 seconds.
     :return: MLRun RunObject or PipelineNodeWrapper
     """
     if artifact_path:
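The expanded docstring suggests usage along these lines. A hedged sketch (the Retry constructor keywords beyond count and backoff are assumptions; only count and backoff are documented in this diff):

import mlrun
from mlrun.model import Retry

project = mlrun.get_or_create_project("demo", context="./")

# Retry object: up to 3 attempts; with no explicit backoff, the delay
# between attempts defaults to 30 seconds per the docstring above.
run = project.run_function("trainer", retry=Retry(count=3))

# Equivalent dict form; count=0 disables retries entirely.
run = project.run_function("trainer", retry={"count": 0})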
mlrun/projects/pipelines.py
CHANGED
@@ -228,11 +228,11 @@ class _PipelineContext:
         force_run_local = mlrun.mlconf.force_run_local
         if force_run_local is None or force_run_local == "auto":
             force_run_local = not mlrun.mlconf.is_api_running_on_k8s()
+
+        if self.workflow:
             if not mlrun.mlconf.kfp_url:
                 logger.debug("Kubeflow pipeline URL is not set, running locally")
                 force_run_local = True
-
-        if self.workflow:
             force_run_local = force_run_local or self.workflow.run_local

         return force_run_local
@@ -1072,7 +1072,11 @@ def github_webhook(request):


 def rerun_workflow(
-    context: mlrun.execution.MLClientCtx,
+    context: mlrun.execution.MLClientCtx,
+    run_uid: str,
+    project_name: str,
+    original_runner_uid: str,
+    original_workflow_name: str,
 ):
     """
     Re-run a workflow by retrying a previously failed KFP pipeline.
@@ -1080,8 +1084,11 @@ def rerun_workflow(
     :param context: MLRun context.
     :param run_uid: The run UID of the original workflow to retry.
     :param project_name: The project name.
+    :param original_runner_uid: The original workflow runner UID.
+    :param original_workflow_name: The original workflow name.
     """
     db = mlrun.get_run_db()
+    new_pipeline_id = None

     try:
         # Invoke the KFP retry endpoint (direct-submit mode)
@@ -1096,6 +1103,24 @@ def rerun_workflow(
             rerun_of_workflow=run_uid,
         )

+        # Enqueue "running" notifications server-side for this RerunRunner run
+        db.push_run_notifications(context.uid, project_name)
+
+        context.set_label(
+            mlrun_constants.MLRunInternalLabels.workflow_id, new_pipeline_id
+        )
+        context.update_run()
+
+        context.log_result("workflow_id", new_pipeline_id)
+
+        pipeline = wait_for_pipeline_completion(
+            new_pipeline_id,
+            project=project_name,
+        )
+
+        final_state = pipeline["run"]["status"]
+        context.log_result("workflow_state", final_state, commit=True)
+
     except mlrun.errors.MLRunHTTPError as http_exc:
         logger.error(
             "Failed calling KFP retry API",
@@ -1104,33 +1129,28 @@ def rerun_workflow(
         )
         raise

-    # Enqueue "running" notifications server-side for this RerunRunner run
-    db.push_run_notifications(context.uid, project_name)
-
-    context.set_label(mlrun_constants.MLRunInternalLabels.workflow_id, new_pipeline_id)
-    context.update_run()
-
-    context.log_result("workflow_id", new_pipeline_id)
-
-    try:
-        pipeline = wait_for_pipeline_completion(
-            new_pipeline_id,
-            project=project_name,
-        )
     except Exception as exc:
-
-        "
+        logger.error(
+            "Error during rerun_workflow execution",
+            error=err_to_str(exc),
             rerun_pipeline_id=new_pipeline_id,
-            exc=err_to_str(exc),
         )
-
-    final_state = pipeline["run"]["status"]
-    context.log_result("workflow_state", final_state, commit=True)
+        raise

-
-
-
-
+    finally:
+        # Once the rerun has finished, clear the "retrying" label on the original runner
+        # so that subsequent retry requests can acquire the lock again.
+        db.set_run_retrying_status(
+            project=project_name,
+            name=original_workflow_name,
+            run_id=original_runner_uid,
+            retrying=False,
+        )
+
+        if final_state != mlrun_pipelines.common.models.RunStatuses.succeeded:
+            raise mlrun.errors.MLRunRuntimeError(
+                f"Pipeline retry of {run_uid} finished in state={final_state}"
+            )


 def load_and_run(context, *args, **kwargs):
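Pulled out of context, the restructured rerun flow is easier to follow. A self-contained sketch (the three helpers are stubs, not mlrun APIs; unlike the diff, this sketch checks the final state after the finally block rather than inside it, and pre-initializes final_state so the check is always defined):

def submit_kfp_retry(run_uid: str) -> str:
    return f"{run_uid}-retry"  # stub: the real code calls the KFP retry endpoint

def wait_for_completion(pipeline_id: str) -> str:
    return "Succeeded"  # stub: the real code polls the pipeline until it finishes

def release_retry_lock(run_uid: str) -> None:
    print(f"cleared 'retrying' label for {run_uid}")  # stub for set_run_retrying_status

def rerun(run_uid: str) -> None:
    new_pipeline_id = None  # pre-initialized so except/finally can reference it
    final_state = None
    try:
        new_pipeline_id = submit_kfp_retry(run_uid)
        final_state = wait_for_completion(new_pipeline_id)
    except Exception as exc:
        print(f"Error during rerun execution: {exc} (pipeline={new_pipeline_id})")
        raise
    finally:
        # Always release the lock so later retry requests can acquire it,
        # whether the rerun succeeded, failed, or never got a pipeline id.
        release_retry_lock(run_uid)
    if final_state != "Succeeded":
        raise RuntimeError(f"Pipeline retry of {run_uid} finished in state={final_state}")

rerun("abc123")  # prints the lock release and returns on success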
|