mlrun 1.7.0rc4__py3-none-any.whl → 1.7.2__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun might be problematic; details are available on the registry's advisory page.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +39 -121
- mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
- mlrun/alerts/alert.py +248 -0
- mlrun/api/schemas/__init__.py +4 -3
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +39 -254
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +73 -46
- mlrun/artifacts/model.py +30 -158
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +73 -1
- mlrun/common/db/sql_session.py +3 -2
- mlrun/common/formatters/__init__.py +21 -0
- mlrun/common/formatters/artifact.py +46 -0
- mlrun/common/formatters/base.py +113 -0
- mlrun/common/formatters/feature_set.py +44 -0
- mlrun/common/formatters/function.py +46 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/common/formatters/run.py +29 -0
- mlrun/common/helpers.py +11 -1
- mlrun/{runtimes → common/runtimes}/constants.py +32 -4
- mlrun/common/schemas/__init__.py +31 -4
- mlrun/common/schemas/alert.py +202 -0
- mlrun/common/schemas/api_gateway.py +196 -0
- mlrun/common/schemas/artifact.py +28 -1
- mlrun/common/schemas/auth.py +13 -2
- mlrun/common/schemas/client_spec.py +2 -1
- mlrun/common/schemas/common.py +7 -4
- mlrun/common/schemas/constants.py +3 -0
- mlrun/common/schemas/feature_store.py +58 -28
- mlrun/common/schemas/frontend_spec.py +8 -0
- mlrun/common/schemas/function.py +11 -0
- mlrun/common/schemas/hub.py +7 -9
- mlrun/common/schemas/model_monitoring/__init__.py +21 -4
- mlrun/common/schemas/model_monitoring/constants.py +136 -42
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +89 -41
- mlrun/common/schemas/notification.py +69 -12
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +7 -0
- mlrun/common/schemas/project.py +67 -16
- mlrun/common/schemas/runs.py +17 -0
- mlrun/common/schemas/schedule.py +1 -1
- mlrun/common/schemas/workflow.py +10 -2
- mlrun/common/types.py +14 -1
- mlrun/config.py +233 -58
- mlrun/data_types/data_types.py +11 -1
- mlrun/data_types/spark.py +5 -4
- mlrun/data_types/to_pandas.py +75 -34
- mlrun/datastore/__init__.py +8 -10
- mlrun/datastore/alibaba_oss.py +131 -0
- mlrun/datastore/azure_blob.py +131 -43
- mlrun/datastore/base.py +107 -47
- mlrun/datastore/datastore.py +17 -7
- mlrun/datastore/datastore_profile.py +91 -7
- mlrun/datastore/dbfs_store.py +3 -7
- mlrun/datastore/filestore.py +1 -3
- mlrun/datastore/google_cloud_storage.py +92 -32
- mlrun/datastore/hdfs.py +5 -0
- mlrun/datastore/inmem.py +6 -3
- mlrun/datastore/redis.py +3 -2
- mlrun/datastore/s3.py +30 -12
- mlrun/datastore/snowflake_utils.py +45 -0
- mlrun/datastore/sources.py +274 -59
- mlrun/datastore/spark_utils.py +30 -0
- mlrun/datastore/store_resources.py +9 -7
- mlrun/datastore/storeytargets.py +151 -0
- mlrun/datastore/targets.py +387 -119
- mlrun/datastore/utils.py +68 -5
- mlrun/datastore/v3io.py +28 -50
- mlrun/db/auth_utils.py +152 -0
- mlrun/db/base.py +245 -20
- mlrun/db/factory.py +1 -4
- mlrun/db/httpdb.py +909 -231
- mlrun/db/nopdb.py +279 -14
- mlrun/errors.py +35 -5
- mlrun/execution.py +111 -38
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +46 -53
- mlrun/feature_store/common.py +6 -11
- mlrun/feature_store/feature_set.py +48 -23
- mlrun/feature_store/feature_vector.py +13 -2
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +13 -4
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +24 -32
- mlrun/feature_store/steps.py +38 -19
- mlrun/features.py +6 -14
- mlrun/frameworks/_common/plan.py +3 -3
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
- mlrun/frameworks/_ml_common/plan.py +1 -1
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
- mlrun/frameworks/lgbm/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/model_handler.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +4 -4
- mlrun/frameworks/pytorch/__init__.py +2 -2
- mlrun/frameworks/sklearn/__init__.py +1 -1
- mlrun/frameworks/sklearn/mlrun_interface.py +13 -3
- mlrun/frameworks/tf_keras/__init__.py +5 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
- mlrun/frameworks/xgboost/__init__.py +1 -1
- mlrun/k8s_utils.py +57 -12
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +6 -5
- mlrun/launcher/client.py +13 -11
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +15 -5
- mlrun/launcher/remote.py +10 -3
- mlrun/lists.py +6 -2
- mlrun/model.py +297 -48
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +152 -357
- mlrun/model_monitoring/applications/__init__.py +10 -0
- mlrun/model_monitoring/applications/_application_steps.py +190 -0
- mlrun/model_monitoring/applications/base.py +108 -0
- mlrun/model_monitoring/applications/context.py +341 -0
- mlrun/model_monitoring/{evidently_application.py → applications/evidently_base.py} +27 -22
- mlrun/model_monitoring/applications/histogram_data_drift.py +227 -91
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +130 -303
- mlrun/model_monitoring/{stores/models/sqlite.py → db/__init__.py} +5 -10
- mlrun/model_monitoring/db/stores/__init__.py +136 -0
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/db/stores/base/store.py +213 -0
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/db/stores/sqldb/models/base.py +190 -0
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +103 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +659 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +726 -0
- mlrun/model_monitoring/db/tsdb/__init__.py +105 -0
- mlrun/model_monitoring/db/tsdb/base.py +448 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +298 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +42 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +522 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +158 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +849 -0
- mlrun/model_monitoring/features_drift_table.py +34 -22
- mlrun/model_monitoring/helpers.py +177 -39
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +165 -398
- mlrun/model_monitoring/tracking_policy.py +7 -1
- mlrun/model_monitoring/writer.py +161 -125
- mlrun/package/packagers/default_packager.py +2 -2
- mlrun/package/packagers_manager.py +1 -0
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +67 -228
- mlrun/projects/__init__.py +6 -1
- mlrun/projects/operations.py +47 -20
- mlrun/projects/pipelines.py +396 -249
- mlrun/projects/project.py +1176 -406
- mlrun/render.py +28 -22
- mlrun/run.py +208 -181
- mlrun/runtimes/__init__.py +76 -11
- mlrun/runtimes/base.py +54 -24
- mlrun/runtimes/daskjob.py +9 -2
- mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +1 -29
- mlrun/runtimes/kubejob.py +34 -128
- mlrun/runtimes/local.py +39 -10
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +8 -8
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/api_gateway.py +769 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +758 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/nuclio/function.py +188 -68
- mlrun/runtimes/nuclio/serving.py +57 -60
- mlrun/runtimes/pod.py +191 -58
- mlrun/runtimes/remotesparkjob.py +11 -8
- mlrun/runtimes/sparkjob/spark3job.py +17 -18
- mlrun/runtimes/utils.py +40 -73
- mlrun/secrets.py +6 -2
- mlrun/serving/__init__.py +8 -1
- mlrun/serving/remote.py +2 -3
- mlrun/serving/routers.py +89 -64
- mlrun/serving/server.py +54 -26
- mlrun/serving/states.py +187 -56
- mlrun/serving/utils.py +19 -11
- mlrun/serving/v2_serving.py +136 -63
- mlrun/track/tracker.py +2 -1
- mlrun/track/trackers/mlflow_tracker.py +5 -0
- mlrun/utils/async_http.py +26 -6
- mlrun/utils/db.py +18 -0
- mlrun/utils/helpers.py +375 -105
- mlrun/utils/http.py +2 -2
- mlrun/utils/logger.py +75 -9
- mlrun/utils/notifications/notification/__init__.py +14 -10
- mlrun/utils/notifications/notification/base.py +48 -0
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +24 -1
- mlrun/utils/notifications/notification/ipython.py +2 -0
- mlrun/utils/notifications/notification/slack.py +96 -21
- mlrun/utils/notifications/notification/webhook.py +63 -2
- mlrun/utils/notifications/notification_pusher.py +146 -16
- mlrun/utils/regex.py +9 -0
- mlrun/utils/retryer.py +3 -2
- mlrun/utils/v3io_clients.py +2 -3
- mlrun/utils/version/version.json +2 -2
- mlrun-1.7.2.dist-info/METADATA +390 -0
- mlrun-1.7.2.dist-info/RECORD +351 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/WHEEL +1 -1
- mlrun/feature_store/retrieval/conversion.py +0 -271
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/application.py +0 -310
- mlrun/model_monitoring/batch.py +0 -974
- mlrun/model_monitoring/controller_handler.py +0 -37
- mlrun/model_monitoring/prometheus.py +0 -216
- mlrun/model_monitoring/stores/__init__.py +0 -111
- mlrun/model_monitoring/stores/kv_model_endpoint_store.py +0 -574
- mlrun/model_monitoring/stores/model_endpoint_store.py +0 -145
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/models/base.py +0 -84
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
- mlrun/platforms/other.py +0 -305
- mlrun-1.7.0rc4.dist-info/METADATA +0 -269
- mlrun-1.7.0rc4.dist-info/RECORD +0 -321
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.2.dist-info}/top_level.txt +0 -0
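The per-file diff below (for `mlrun/projects/pipelines.py`) can be reproduced locally from the two public wheels. A minimal sketch, assuming `pip` is on `PATH` and the package index is reachable; directory names are illustrative:

```python
import difflib
import subprocess
import zipfile
from pathlib import Path


def fetch_wheel(spec: str, dest: Path) -> Path:
    """Download a wheel (without installing it) and return its path."""
    dest.mkdir(parents=True, exist_ok=True)
    subprocess.run(
        ["pip", "download", spec, "--no-deps", "--only-binary", ":all:", "-d", str(dest)],
        check=True,
    )
    return next(dest.glob("*.whl"))


def wheel_member_lines(wheel: Path, member: str) -> list[str]:
    """Read one file out of the wheel (a zip archive) as a list of lines."""
    with zipfile.ZipFile(wheel) as archive:
        return archive.read(member).decode("utf-8").splitlines(keepends=True)


old_whl = fetch_wheel("mlrun==1.7.0rc4", Path("wheels/old"))
new_whl = fetch_wheel("mlrun==1.7.2", Path("wheels/new"))
diff = difflib.unified_diff(
    wheel_member_lines(old_whl, "mlrun/projects/pipelines.py"),
    wheel_member_lines(new_whl, "mlrun/projects/pipelines.py"),
    fromfile="mlrun-1.7.0rc4", tofile="mlrun-1.7.2",
)
print("".join(diff))
```

Note that the registry's renderer truncates the content of some removed lines; in the reconstructed hunks below these appear as bare `-` markers or cut-off fragments (e.g. `-        if engine_kind ==`) rather than full source lines.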
mlrun/projects/pipelines.py
CHANGED
```diff
--- mlrun/projects/pipelines.py (mlrun-1.7.0rc4)
+++ mlrun/projects/pipelines.py (mlrun-1.7.2)
@@ -13,24 +13,27 @@
 # limitations under the License.
 import abc
 import builtins
+import http
 import importlib.util as imputil
 import os
 import tempfile
 import typing
 import uuid
 
-import
-
-
+import mlrun_pipelines.common.models
+import mlrun_pipelines.patcher
+import mlrun_pipelines.utils
 
 import mlrun
+import mlrun.common.runtimes.constants
 import mlrun.common.schemas
+import mlrun.common.schemas.function
+import mlrun.common.schemas.workflow
 import mlrun.utils.notifications
 from mlrun.errors import err_to_str
 from mlrun.utils import (
     get_ui_url,
     logger,
-    new_pipe_metadata,
     normalize_workflow_name,
     retry_until_successful,
 )
```
```diff
@@ -43,21 +46,21 @@ from ..runtimes.pod import AutoMountType
 
 def get_workflow_engine(engine_kind, local=False):
     if pipeline_context.is_run_local(local):
-        if engine_kind ==
+        if engine_kind == mlrun.common.schemas.workflow.EngineType.KFP:
             logger.warning(
                 "Running kubeflow pipeline locally, note some ops may not run locally!"
             )
-        elif engine_kind ==
+        elif engine_kind == mlrun.common.schemas.workflow.EngineType.REMOTE:
             raise mlrun.errors.MLRunInvalidArgumentError(
                 "Cannot run a remote pipeline locally using `kind='remote'` and `local=True`. "
                 "in order to run a local pipeline remotely, please use `engine='remote:local'` instead"
             )
         return _LocalRunner
-    if not engine_kind or engine_kind ==
+    if not engine_kind or engine_kind == mlrun.common.schemas.workflow.EngineType.KFP:
         return _KFPRunner
-    if engine_kind ==
+    if engine_kind == mlrun.common.schemas.workflow.EngineType.LOCAL:
         return _LocalRunner
-    if engine_kind ==
+    if engine_kind == mlrun.common.schemas.workflow.EngineType.REMOTE:
         return _RemoteRunner
     raise mlrun.errors.MLRunInvalidArgumentError(
         f"Provided workflow engine is not supported. engine_kind={engine_kind}"
```
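For orientation, the engine strings resolved above now compare against the `EngineType` enum rather than bare strings. A hedged sketch of how these values are reached from user code; the project and workflow names are hypothetical, `project.run(...)` is the usual entry point, and the `remote:local` form is taken from the error message in the hunk above:

```python
import mlrun

project = mlrun.get_or_create_project("demo", context="./")

# engine=None or "kfp"   -> _KFPRunner (Kubeflow Pipelines)
# engine="local"         -> _LocalRunner
# engine="remote"        -> _RemoteRunner (incompatible with local=True)
# engine="remote:local"  -> submit a local pipeline to run remotely
run_status = project.run("main", engine="remote:local", watch=False)
```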
```diff
@@ -79,6 +82,7 @@ class WorkflowSpec(mlrun.model.ModelObj):
         schedule: typing.Union[str, mlrun.common.schemas.ScheduleCronTrigger] = None,
         cleanup_ttl: typing.Optional[int] = None,
         image: typing.Optional[str] = None,
+        workflow_runner_node_selector: typing.Optional[dict[str, str]] = None,
     ):
         self.engine = engine
         self.code = code
@@ -92,6 +96,7 @@ class WorkflowSpec(mlrun.model.ModelObj):
         self._tmp_path = None
         self.schedule = schedule
         self.image = image
+        self.workflow_runner_node_selector = workflow_runner_node_selector
 
     def get_source_file(self, context=""):
         if not self.code and not self.path:
```
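Based on the constructor change above, the new field can be set directly when building the spec. A sketch with illustrative labels; how the runner pod consumes the selector is not shown in this hunk:

```python
from mlrun.projects.pipelines import WorkflowSpec

# Pin the pod that drives the workflow to a dedicated node group (illustrative labels).
spec = WorkflowSpec(
    engine="kfp",
    image="mlrun/mlrun",
    workflow_runner_node_selector={"node-group": "workflow-runners"},
)
```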
```diff
@@ -218,9 +223,10 @@ class _PipelineContext:
         force_run_local = mlrun.mlconf.force_run_local
         if force_run_local is None or force_run_local == "auto":
             force_run_local = not mlrun.mlconf.is_api_running_on_k8s()
-
-
+        if not mlrun.mlconf.kfp_url:
+            logger.debug("Kubeflow pipeline URL is not set, running locally")
             force_run_local = True
+
         if self.workflow:
             force_run_local = force_run_local or self.workflow.run_local
 
@@ -300,72 +306,6 @@ def _enrich_kfp_pod_security_context(kfp_pod_template, function):
     }
 
 
-# When we run pipelines, the kfp.compile.Compile.compile() method takes the decorated function with @dsl.pipeline and
-# converts it to a k8s object. As part of the flow in the Compile.compile() method,
-# we call _create_and_write_workflow, which builds a dictionary from the workflow and then writes it to a file.
-# Unfortunately, the kfp sdk does not provide an API for configuring priority_class_name and other attributes.
-# I ran across the following problem when seeking for a method to set the priority_class_name:
-# https://github.com/kubeflow/pipelines/issues/3594
-# When we patch the _create_and_write_workflow, we can eventually obtain the dictionary right before we write it
-# to a file and enrich it with argo compatible fields, make sure you looking for the same argo version we use
-# https://github.com/argoproj/argo-workflows/blob/release-2.7/pkg/apis/workflow/v1alpha1/workflow_types.go
-def _create_enriched_mlrun_workflow(
-    self,
-    pipeline_func: typing.Callable,
-    pipeline_name: typing.Optional[str] = None,
-    pipeline_description: typing.Optional[str] = None,
-    params_list: typing.Optional[list[dsl.PipelineParam]] = None,
-    pipeline_conf: typing.Optional[dsl.PipelineConf] = None,
-):
-    """Call internal implementation of create_workflow and enrich with mlrun functions attributes"""
-    workflow = self._original_create_workflow(
-        pipeline_func, pipeline_name, pipeline_description, params_list, pipeline_conf
-    )
-    # We don't want to interrupt the original flow and don't know all the scenarios the function could be called.
-    # that's why we have try/except on all the code of the enrichment and also specific try/except for errors that
-    # we know can be raised.
-    try:
-        functions = []
-        if pipeline_context.functions:
-            try:
-                functions = pipeline_context.functions.values()
-            except Exception as err:
-                logger.debug(
-                    "Unable to retrieve project functions, not enriching workflow with mlrun",
-                    error=err_to_str(err),
-                )
-                return workflow
-
-        # enrich each pipeline step with your desire k8s attribute
-        for kfp_step_template in workflow["spec"]["templates"]:
-            if kfp_step_template.get("container"):
-                for function_obj in functions:
-                    # we condition within each function since the comparison between the function and
-                    # the kfp pod may change depending on the attribute type.
-                    _set_function_attribute_on_kfp_pod(
-                        kfp_step_template,
-                        function_obj,
-                        "PriorityClassName",
-                        "priority_class_name",
-                    )
-                    _enrich_kfp_pod_security_context(
-                        kfp_step_template,
-                        function_obj,
-                    )
-    except mlrun.errors.MLRunInvalidArgumentError:
-        raise
-    except Exception as err:
-        logger.debug(
-            "Something in the enrichment of kfp pods failed", error=err_to_str(err)
-        )
-    return workflow
-
-
-# patching function as class method
-kfp.compiler.Compiler._original_create_workflow = kfp.compiler.Compiler._create_workflow
-kfp.compiler.Compiler._create_workflow = _create_enriched_mlrun_workflow
-
-
 def get_db_function(project, key) -> mlrun.runtimes.BaseRuntime:
     project_instance, name, tag, hash_key = parse_versioned_object_uri(
         key, project.metadata.name
```
```diff
@@ -375,7 +315,11 @@ def get_db_function(project, key) -> mlrun.runtimes.BaseRuntime:
 
 
 def enrich_function_object(
-    project
+    project: mlrun.common.schemas.Project,
+    function: mlrun.runtimes.BaseRuntime,
+    decorator: typing.Callable = None,
+    copy_function: bool = True,
+    try_auto_mount: bool = True,
 ) -> mlrun.runtimes.BaseRuntime:
     if hasattr(function, "_enriched"):
         return function
@@ -412,6 +356,10 @@ def enrich_function_object(
     if decorator:
         decorator(f)
 
+    if project.spec.default_function_node_selector:
+        f.enrich_runtime_spec(
+            project.spec.default_function_node_selector,
+        )
     if try_auto_mount:
         if (
             decorator and AutoMountType.is_auto_modifier(decorator)
```
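The addition above means a project-level default node selector, when present, is merged into every function enriched for a pipeline. A minimal sketch, assuming the spec field can be assigned directly (values are illustrative):

```python
import mlrun

project = mlrun.get_or_create_project("demo", context="./")
project.spec.default_function_node_selector = {"workload": "pipelines"}

fn = project.set_function("trainer.py", name="trainer", kind="job", image="mlrun/mlrun")
# enrich_function_object() above applies the project default to the function's runtime spec
enriched = enrich_function_object(project, fn)
```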
```diff
@@ -431,7 +379,7 @@ class _PipelineRunStatus:
         engine: type["_PipelineRunner"],
         project: "mlrun.projects.MlrunProject",
         workflow: WorkflowSpec = None,
-        state:
+        state: mlrun_pipelines.common.models.RunStatuses = "",
         exc: Exception = None,
     ):
         """
@@ -451,7 +399,10 @@ class _PipelineRunStatus:
 
     @property
     def state(self):
-        if
+        if (
+            self._state
+            not in mlrun_pipelines.common.models.RunStatuses.stable_statuses()
+        ):
             self._state = self._engine.get_state(self.run_id, self.project)
         return self._state
 
@@ -460,12 +411,15 @@ class _PipelineRunStatus:
         return self._exc
 
     def wait_for_completion(self, timeout=None, expected_statuses=None):
-
-            self
+        returned_state = self._engine.wait_for_completion(
+            self,
             project=self.project,
             timeout=timeout,
             expected_statuses=expected_statuses,
         )
+        # TODO: returning a state is optional until all runners implement wait_for_completion
+        if returned_state:
+            self._state = returned_state
         return self._state
 
     def __str__(self):
```
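The reworked `state` property amounts to a memoization rule: keep polling the engine only while the run is in a non-stable status. A self-contained sketch of that rule; the status set is illustrative, whereas mlrun takes it from `RunStatuses.stable_statuses()`:

```python
STABLE_STATUSES = {"succeeded", "failed", "skipped", "error"}  # illustrative set


class CachedRunState:
    def __init__(self, poll_state):
        self._poll_state = poll_state  # callable returning the engine's latest status
        self._state = ""

    @property
    def state(self) -> str:
        # Refresh only while non-terminal; a stable status is final and stays cached.
        if self._state not in STABLE_STATUSES:
            self._state = self._poll_state()
        return self._state
```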
```diff
@@ -505,7 +459,12 @@ class _PipelineRunner(abc.ABC):
 
     @staticmethod
     @abc.abstractmethod
-    def wait_for_completion(
+    def wait_for_completion(
+        run: "_PipelineRunStatus",
+        project: typing.Optional["mlrun.projects.MlrunProject"] = None,
+        timeout: typing.Optional[int] = None,
+        expected_statuses: list[str] = None,
+    ):
         pass
 
     @staticmethod
@@ -513,10 +472,52 @@ class _PipelineRunner(abc.ABC):
     def get_state(run_id, project=None):
         pass
 
+    @staticmethod
+    def get_run_status(
+        project,
+        run: _PipelineRunStatus,
+        timeout=None,
+        expected_statuses=None,
+        notifiers: mlrun.utils.notifications.CustomNotificationPusher = None,
+        **kwargs,
+    ):
+        timeout = timeout or 60 * 60
+        raise_error = None
+        state = ""
+        try:
+            if timeout:
+                state = run.wait_for_completion(
+                    timeout=timeout, expected_statuses=expected_statuses
+                )
+        except RuntimeError as exc:
+            # push runs table also when we have errors
+            raise_error = exc
+
+        mldb = mlrun.db.get_run_db(secrets=project._secrets)
+        runs = mldb.list_runs(project=project.name, labels=f"workflow={run.run_id}")
+
+        # TODO: The below section duplicates notifiers.push_pipeline_run_results() logic. We should use it instead.
+        errors_counter = 0
+        for r in runs:
+            if r["status"].get("state", "") == "error":
+                errors_counter += 1
+
+        text = _PipelineRunner._generate_workflow_finished_message(
+            run.run_id, errors_counter, run._state
+        )
+
+        notifiers = notifiers or project.notifiers
+        if notifiers:
+            notifiers.push(text, "info", runs)
+
+        if raise_error:
+            raise raise_error
+        return state or run._state, errors_counter, text
+
     @staticmethod
     def _get_handler(workflow_handler, workflow_spec, project, secrets):
         if not (workflow_handler and callable(workflow_handler)):
-            workflow_file = workflow_spec.get_source_file(project.spec.
+            workflow_file = workflow_spec.get_source_file(project.spec.get_code_path())
             workflow_handler = create_pipeline(
                 project,
                 workflow_file,
```
```diff
@@ -529,15 +530,13 @@ class _PipelineRunner(abc.ABC):
         return workflow_handler
 
     @staticmethod
-
-
-
-
-
-
-
-    ):
-        pass
+    def _generate_workflow_finished_message(run_id, errors_counter, state):
+        text = f"Workflow {run_id} finished"
+        if errors_counter:
+            text += f" with {errors_counter} errors"
+        if state:
+            text += f", state={state}"
+        return text
 
 
 class _KFPRunner(_PipelineRunner):
```
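The helper above makes the summary text deterministic, and the shared `get_run_status` added earlier returns that text alongside the state and error count; `handle_workflow_completion` later in this diff unpacks it the same way. For example (`project` and `run_status` are assumed to exist as in the hunks above):

```python
# Doctest-style illustration of the message helper:
>>> _PipelineRunner._generate_workflow_finished_message("wf-1234", 2, "failed")
'Workflow wf-1234 finished with 2 errors, state=failed'

# Consuming the shared get_run_status() via the project object:
>>> state, errors_counter, text = project.get_run_status(run_status)
```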
```diff
@@ -548,7 +547,7 @@ class _KFPRunner(_PipelineRunner):
     @classmethod
     def save(cls, project, workflow_spec: WorkflowSpec, target, artifact_path=None):
         pipeline_context.set(project, workflow_spec)
-        workflow_file = workflow_spec.get_source_file(project.spec.
+        workflow_file = workflow_spec.get_source_file(project.spec.get_code_path())
         functions = FunctionsDict(project)
         pipeline = create_pipeline(
             project,
@@ -556,13 +555,14 @@ class _KFPRunner(_PipelineRunner):
             functions,
             secrets=project._secrets,
         )
-
-
-        conf = new_pipe_metadata(
-            artifact_path=artifact_path,
+        mlrun_pipelines.utils.compile_pipeline(
+            artifact_path=artifact_path or project.spec.artifact_path,
             cleanup_ttl=workflow_spec.cleanup_ttl,
+            ops=None,
+            pipeline=pipeline,
+            pipe_file=target,
+            type_check=True,
         )
-        compiler.Compiler().compile(pipeline, target, pipeline_conf=conf)
         workflow_spec.clear_tmp()
         pipeline_context.clear()
@@ -593,12 +593,13 @@ class _KFPRunner(_PipelineRunner):
             logger.warning(
                 "Setting notifications on kfp pipeline runner uses old notification behavior. "
                 "Notifications will only be sent if you wait for pipeline completion. "
-                "
+                "Some of the features (like setting message or severity level) are not supported."
             )
-            for notification
-
-
-            )
+            # for start message, fallback to old notification behavior
+            for notification in notifications or []:
+                params = notification.params
+                params.update(notification.secret_params)
+                project.notifiers.add_notification(notification.kind, params)
 
         run_id = _run_pipeline(
             workflow_handler,
@@ -608,6 +609,7 @@ class _KFPRunner(_PipelineRunner):
             namespace=namespace,
             artifact_path=artifact_path,
             cleanup_ttl=workflow_spec.cleanup_ttl,
+            timeout=int(mlrun.mlconf.workflows.timeouts.kfp),
         )
 
         # The user provided workflow code might have made changes to function specs that require cleanup
@@ -625,7 +627,6 @@ class _KFPRunner(_PipelineRunner):
                     func_name=func.metadata.name,
                     exc_info=err_to_str(exc),
                 )
-
         project.notifiers.push_pipeline_start_message(
             project.metadata.name,
             project.get_param("commit_id", None),
@@ -636,12 +637,21 @@ class _KFPRunner(_PipelineRunner):
         return _PipelineRunStatus(run_id, cls, project=project, workflow=workflow_spec)
 
     @staticmethod
-    def wait_for_completion(
-
-
+    def wait_for_completion(
+        run: "_PipelineRunStatus",
+        project: typing.Optional["mlrun.projects.MlrunProject"] = None,
+        timeout: typing.Optional[int] = None,
+        expected_statuses: list[str] = None,
+    ):
         project_name = project.metadata.name if project else ""
+        logger.info(
+            "Waiting for pipeline run completion",
+            run_id=run.run_id,
+            project=project_name,
+        )
+        timeout = timeout or 60 * 60
         run_info = wait_for_pipeline_completion(
-            run_id,
+            run.run_id,
             timeout=timeout,
             expected_statuses=expected_statuses,
             project=project_name,
@@ -659,50 +669,6 @@ class _KFPRunner(_PipelineRunner):
             return resp["run"].get("status", "")
         return ""
 
-    @staticmethod
-    def get_run_status(
-        project,
-        run,
-        timeout=None,
-        expected_statuses=None,
-        notifiers: mlrun.utils.notifications.CustomNotificationPusher = None,
-    ):
-        if timeout is None:
-            timeout = 60 * 60
-        state = ""
-        raise_error = None
-        try:
-            if timeout:
-                logger.info("Waiting for pipeline run completion")
-                state = run.wait_for_completion(
-                    timeout=timeout, expected_statuses=expected_statuses
-                )
-        except RuntimeError as exc:
-            # push runs table also when we have errors
-            raise_error = exc
-
-        mldb = mlrun.db.get_run_db(secrets=project._secrets)
-        runs = mldb.list_runs(project=project.name, labels=f"workflow={run.run_id}")
-
-        # TODO: The below section duplicates notifiers.push_pipeline_run_results() logic. We should use it instead.
-        had_errors = 0
-        for r in runs:
-            if r["status"].get("state", "") == "error":
-                had_errors += 1
-
-        text = f"Workflow {run.run_id} finished"
-        if had_errors:
-            text += f" with {had_errors} errors"
-        if state:
-            text += f", state={state}"
-
-        notifiers = notifiers or project.notifiers
-        notifiers.push(text, "info", runs)
-
-        if raise_error:
-            raise raise_error
-        return state, had_errors, text
-
 
 class _LocalRunner(_PipelineRunner):
     """local pipelines runner"""
@@ -741,13 +707,14 @@ class _LocalRunner(_PipelineRunner):
         original_source = project.spec.source
         project.set_source(source=source)
         pipeline_context.workflow_artifact_path = artifact_path
+
         project.notifiers.push_pipeline_start_message(
             project.metadata.name, pipeline_id=workflow_id
         )
         err = None
         try:
             workflow_handler(**workflow_spec.args)
-            state =
+            state = mlrun_pipelines.common.models.RunStatuses.succeeded
         except Exception as exc:
             err = exc
             logger.exception("Workflow run failed")
@@ -755,7 +722,7 @@ class _LocalRunner(_PipelineRunner):
                 f":x: Workflow {workflow_id} run failed!, error: {err_to_str(exc)}",
                 mlrun.common.schemas.NotificationSeverity.ERROR,
             )
-            state =
+            state = mlrun_pipelines.common.models.RunStatuses.failed
         mlrun.run.wait_for_runs_completion(pipeline_context.runs_map.values())
         project.notifiers.push_pipeline_run_results(
             pipeline_context.runs_map.values(), state=state
@@ -779,17 +746,10 @@ class _LocalRunner(_PipelineRunner):
         return ""
 
     @staticmethod
-    def wait_for_completion(
-
-
-
-    def get_run_status(
-        project,
-        run,
-        timeout=None,
-        expected_statuses=None,
-        notifiers: mlrun.utils.notifications.CustomNotificationPusher = None,
-    ):
+    def wait_for_completion(run, project=None, timeout=None, expected_statuses=None):
+        # TODO: local runner blocks for the duration of the pipeline.
+        # Therefore usually there will be nothing to wait for.
+        # However, users may run functions with watch=False and then it can be useful to wait for the runs here.
         pass
```
```diff
@@ -814,10 +774,6 @@ class _RemoteRunner(_PipelineRunner):
         workflow_name = normalize_workflow_name(name=name, project_name=project.name)
         workflow_id = None
 
-        # for start message, fallback to old notification behavior
-        for notification in notifications or []:
-            project.notifiers.add_notification(notification.kind, notification.params)
-
         # The returned engine for this runner is the engine of the workflow.
         # In this way wait_for_completion/get_run_status would be executed by the correct pipeline runner.
         inner_engine = get_workflow_engine(workflow_spec.engine)
@@ -865,22 +821,44 @@ class _RemoteRunner(_PipelineRunner):
             )
             return
 
+        get_workflow_id_timeout = max(
+            int(mlrun.mlconf.workflows.timeouts.remote),
+            int(getattr(mlrun.mlconf.workflows.timeouts, inner_engine.engine)),
+        )
+
         logger.debug(
             "Workflow submitted, waiting for pipeline run to start",
             workflow_name=workflow_response.name,
+            get_workflow_id_timeout=get_workflow_id_timeout,
         )
 
+        def _get_workflow_id_or_bail():
+            try:
+                return run_db.get_workflow_id(
+                    project=project.name,
+                    name=workflow_response.name,
+                    run_id=workflow_response.run_id,
+                    engine=workflow_spec.engine,
+                )
+            except mlrun.errors.MLRunHTTPStatusError as get_wf_exc:
+                # fail fast on specific errors
+                if get_wf_exc.error_status_code in [
+                    http.HTTPStatus.PRECONDITION_FAILED
+                ]:
+                    raise mlrun.errors.MLRunFatalFailureError(
+                        original_exception=get_wf_exc
+                    )
+
+                # raise for a retry (on other errors)
+                raise
+
         # Getting workflow id from run:
         response = retry_until_successful(
             1,
-
+            get_workflow_id_timeout,
             logger,
             False,
-
-            project=project.name,
-            name=workflow_response.name,
-            run_id=workflow_response.run_id,
-            engine=workflow_spec.engine,
+            _get_workflow_id_or_bail,
         )
         workflow_id = response.workflow_id
         # After fetching the workflow_id the workflow executed successfully
```
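The submission path above wraps the workflow-id lookup in a closure so the retry helper can retry transient errors, while an HTTP 412 aborts immediately via `MLRunFatalFailureError`. A self-contained sketch of that fail-fast-inside-retry pattern, with stand-ins for mlrun's helpers (the real `retry_until_successful(interval, timeout, logger, verbose, func)` behaves similarly):

```python
import time


class FatalFailure(Exception):
    """Stand-in for mlrun.errors.MLRunFatalFailureError: never retried."""


def retry_until_successful(interval: float, timeout: float, func):
    deadline = time.monotonic() + timeout
    while True:
        try:
            return func()
        except FatalFailure:
            raise  # fail fast: a precondition error will not heal on retry
        except Exception:
            if time.monotonic() >= deadline:
                raise  # retry budget exhausted
            time.sleep(interval)  # transient error: try again
```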
```diff
@@ -892,12 +870,9 @@ class _RemoteRunner(_PipelineRunner):
                 f":x: Workflow {workflow_name} run failed!, error: {err_to_str(exc)}",
                 mlrun.common.schemas.NotificationSeverity.ERROR,
             )
-            state =
+            state = mlrun_pipelines.common.models.RunStatuses.failed
         else:
-            state =
-            project.notifiers.push_pipeline_start_message(
-                project.metadata.name,
-            )
+            state = mlrun_pipelines.common.models.RunStatuses.running
         pipeline_context.clear()
         return _PipelineRunStatus(
             run_id=workflow_id,
@@ -911,24 +886,59 @@ class _RemoteRunner(_PipelineRunner):
     @staticmethod
     def get_run_status(
         project,
-        run,
+        run: _PipelineRunStatus,
         timeout=None,
         expected_statuses=None,
         notifiers: mlrun.utils.notifications.CustomNotificationPusher = None,
+        inner_engine: type[_PipelineRunner] = None,
     ):
-
-
-
-
-
-
-
-
-
+        inner_engine = inner_engine or _KFPRunner
+        if inner_engine.engine == _KFPRunner.engine:
+            # ignore notifiers for remote notifications, as they are handled by the remote pipeline notifications,
+            # so overriding with CustomNotificationPusher with empty list of notifiers or only local notifiers
+            local_project_notifiers = list(
+                set(mlrun.utils.notifications.NotificationTypes.local()).intersection(
+                    set(project.notifiers.notifications.keys())
+                )
+            )
+            notifiers = mlrun.utils.notifications.CustomNotificationPusher(
+                local_project_notifiers
+            )
+            return _KFPRunner.get_run_status(
+                project,
+                run,
+                timeout,
+                expected_statuses,
+                notifiers=notifiers,
+            )
+
+        elif inner_engine.engine == _LocalRunner.engine:
+            mldb = mlrun.db.get_run_db(secrets=project._secrets)
+            pipeline_runner_run = mldb.read_run(run.run_id, project=project.name)
+
+            pipeline_runner_run = mlrun.run.RunObject.from_dict(pipeline_runner_run)
+
+            # here we are waiting for the pipeline run to complete and refreshing after that the pipeline run from the
+            # db
+            # TODO: do it with timeout
+            pipeline_runner_run.logs(db=mldb)
+            pipeline_runner_run.refresh()
+            run._state = mlrun.common.runtimes.constants.RunStates.run_state_to_pipeline_run_status(
+                pipeline_runner_run.status.state
+            )
+            run._exc = pipeline_runner_run.status.error
+            return _LocalRunner.get_run_status(
+                project,
+                run,
+                timeout,
+                expected_statuses,
+                notifiers=notifiers,
+            )
 
-
-
-
+        else:
+            raise mlrun.errors.MLRunInvalidArgumentError(
+                f"Unsupported inner runner engine: {inner_engine.engine}"
+            )
 
 
 def create_pipeline(project, pipeline, functions, secrets=None, handler=None):
```
```diff
@@ -974,14 +984,25 @@ def github_webhook(request):
     return {"msg": "pushed"}
 
 
-def load_and_run(
+def load_and_run(context, *args, **kwargs):
+    """
+    This function serves as an alias to `load_and_run_workflow`,
+    allowing to continue using `load_and_run` without modifying existing workflows or exported runs.
+    This approach ensures backward compatibility,
+    while directing all new calls to the updated `load_and_run_workflow` function.
+    """
+    kwargs.pop("load_only", None)
+    kwargs.pop("save", None)
+    load_and_run_workflow(context, *args, **kwargs)
+
+
+def load_and_run_workflow(
     context: mlrun.execution.MLClientCtx,
     url: str = None,
     project_name: str = "",
     init_git: bool = None,
     subpath: str = None,
     clone: bool = False,
-    save: bool = True,
     workflow_name: str = None,
     workflow_path: str = None,
     workflow_arguments: dict[str, typing.Any] = None,
```
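In practice the alias means previously exported runner invocations keep working unchanged; the removed arguments are simply dropped before delegating. An illustrative call (the URL and names are hypothetical, and `context` is the `MLClientCtx` passed to the runner):

```python
# Old-style call sites may still pass the removed parameters:
load_and_run(
    context,
    url="git://github.com/org/repo.git#main",
    project_name="my-project",
    workflow_name="main",
    save=True,        # removed parameter: popped by the alias
    load_only=False,  # removed parameter: popped by the alias
)
# ...which is equivalent to calling load_and_run_workflow() without them.
```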
```diff
@@ -994,14 +1015,12 @@ def load_and_run(
     local: bool = None,
     schedule: typing.Union[str, mlrun.common.schemas.ScheduleCronTrigger] = None,
     cleanup_ttl: int = None,
-    load_only: bool = False,
     wait_for_completion: bool = False,
     project_context: str = None,
 ):
     """
     Auxiliary function that the RemoteRunner run once or run every schedule.
     This function loads a project from a given remote source and then runs the workflow.
-
     :param context: mlrun context.
     :param url: remote url that represents the project's source.
                 See 'mlrun.load_project()' for details
@@ -1009,7 +1028,6 @@ def load_and_run(
     :param init_git: if True, will git init the context dir
     :param subpath: project subpath (within the archive)
     :param clone: if True, always clone (delete any existing content)
-    :param save: whether to save the created project and artifact in the DB
     :param workflow_name: name of the workflow
     :param workflow_path: url to a workflow file, if not a project workflow
     :param workflow_arguments: kubeflow pipelines arguments (parameters)
@@ -1025,48 +1043,38 @@ def load_and_run(
     :param schedule: ScheduleCronTrigger class instance or a standard crontab expression string
     :param cleanup_ttl: pipeline cleanup ttl in secs (time to wait after workflow completion, at which point the
                         workflow and all its resources are deleted)
-    :param load_only: for just loading the project, inner use.
     :param wait_for_completion: wait for workflow completion before returning
     :param project_context: project context path (used for loading the project)
     """
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-            ["slack"]
-        )
-        url = get_ui_url(project_name, context.uid)
-        link = f"<{url}|*view workflow job details*>"
-        message = (
-            f":x: Failed to run scheduled workflow {workflow_name} in Project {project_name} !\n"
-            f"error: ```{error}```\n{link}"
-        )
-        # Sending Slack Notification without losing the original error:
-        try:
-            notification_pusher.push(
-                message=message,
-                severity=mlrun.common.schemas.NotificationSeverity.ERROR,
-            )
-
-        except Exception as exc:
-            logger.error("Failed to send slack notification", exc=exc)
-
-        raise error
+    project_context = project_context or f"./{project_name}"
+
+    # Load the project to fetch files which the runner needs, such as remote source files
+    pull_remote_project_files(
+        context=context,
+        project_context=project_context,
+        url=url,
+        project_name=project_name,
+        init_git=init_git,
+        subpath=subpath,
+        clone=clone,
+        schedule=schedule,
+        workflow_name=workflow_name,
+    )
 
-
+    # Retrieve the project object:
+    # - If the project exists in the MLRun database, it will be loaded from there.
+    # - If it doesn't exist in the database, it will be created from the previously loaded local directory.
+    project = mlrun.get_or_create_project(
+        context=project_context or f"./{project_name}",
+        name=project_name,
+    )
 
-    if
-
+    # extract "start" notification if exists
+    start_notifications = [
+        notification
+        for notification in context.get_notifications(unmask_secret_params=True)
+        if "running" in notification.when
+    ]
 
     workflow_log_message = workflow_name or workflow_path
     context.logger.info(f"Running workflow {workflow_log_message} from remote")
```
```diff
@@ -1083,26 +1091,165 @@ def load_and_run(
         cleanup_ttl=cleanup_ttl,
         engine=engine,
         local=local,
+        notifications=start_notifications,
     )
     context.log_result(key="workflow_id", value=run.run_id)
     context.log_result(key="engine", value=run._engine.engine, commit=True)
 
-    if run.state ==
+    if run.state == mlrun_pipelines.common.models.RunStatuses.failed:
         raise RuntimeError(f"Workflow {workflow_log_message} failed") from run.exc
 
     if wait_for_completion:
+        handle_workflow_completion(
+            run=run,
+            project=project,
+            context=context,
+            workflow_log_message=workflow_log_message,
+        )
+
+
+def pull_remote_project_files(
+    context: mlrun.execution.MLClientCtx,
+    project_context: str,
+    url: str,
+    project_name: str,
+    init_git: typing.Optional[bool],
+    subpath: typing.Optional[str],
+    clone: bool,
+    schedule: typing.Optional[
+        typing.Union[str, mlrun.common.schemas.ScheduleCronTrigger]
+    ],
+    workflow_name: typing.Optional[str],
+) -> None:
+    """
+    Load the project to clone remote files if they exist.
+    If an exception occurs during project loading, send a notification if the workflow is scheduled.
+
+    :param context: MLRun execution context.
+    :param project_context: Path to the project context.
+    :param url: URL of the project repository.
+    :param project_name: Name of the project.
+    :param init_git: Initialize a git repository.
+    :param subpath: Project subpath within the repository.
+    :param clone: Whether to clone the repository.
+    :param schedule: Schedule for running the workflow.
+    :param workflow_name: Name of the workflow to run.
+    """
+    try:
+        # Load the project to clone remote files if they exist.
+        # Using save=False to avoid overriding changes from the database if it already exists.
+        mlrun.load_project(
+            context=project_context,
+            url=url,
+            name=project_name,
+            init_git=init_git,
+            subpath=subpath,
+            clone=clone,
+            save=False,
+        )
+    except Exception as error:
+        notify_scheduled_workflow_failure(
+            schedule=schedule,
+            project_name=project_name,
+            workflow_name=workflow_name,
+            error=error,
+            context_uid=context.uid,
+        )
+        raise error
+
+
+def notify_scheduled_workflow_failure(
+    schedule,
+    project_name: str,
+    workflow_name: str,
+    error: Exception,
+    context_uid: str,
+) -> None:
+    if schedule:
+        notification_pusher = mlrun.utils.notifications.CustomNotificationPusher(
+            ["slack"]
+        )
+        url = get_ui_url(project_name, context_uid)
+        link = f"<{url}|*view workflow job details*>"
+        message = (
+            f":x: Failed to run scheduled workflow {workflow_name} "
+            f"in Project {project_name}!\n"
+            f"Error: ```{err_to_str(error)}```\n{link}"
+        )
+        # Sending Slack Notification without losing the original error:
         try:
-
-
-
-            "Failed waiting for workflow completion",
-            workflow=workflow_log_message,
-            exc=err_to_str(exc),
+            notification_pusher.push(
+                message=message,
+                severity=mlrun.common.schemas.NotificationSeverity.ERROR,
             )
 
-
-
-
-
-
-
+        except Exception as exc:
+            logger.error("Failed to send slack notification", exc=err_to_str(exc))
+
+
+def handle_workflow_completion(
+    run: _PipelineRunStatus,
+    project,
+    context: mlrun.execution.MLClientCtx,
+    workflow_log_message: str,
+) -> None:
+    """
+    Handle workflow completion by waiting for it to finish and logging the final state.
+
+    :param run: Run object containing workflow execution details.
+    :param project: MLRun project object.
+    :param context: MLRun execution context.
+    :param workflow_log_message: Message used for logging.
+    """
+    try:
+        run.wait_for_completion()
+    except Exception as exc:
+        mlrun.utils.logger.error(
+            "Failed waiting for workflow completion",
+            workflow=workflow_log_message,
+            exc=err_to_str(exc),
+        )
+
+    pipeline_state, _, _ = project.get_run_status(run)
+    context.log_result(key="workflow_state", value=pipeline_state, commit=True)
+    if pipeline_state != mlrun_pipelines.common.models.RunStatuses.succeeded:
+        raise RuntimeError(
+            f"Workflow {workflow_log_message} failed, state={pipeline_state}"
+        )
+
+
+def import_remote_project(
+    context: mlrun.execution.MLClientCtx,
+    url: str = None,
+    project_name: str = "",
+    init_git: bool = None,
+    subpath: str = None,
+    clone: bool = False,
+    save: bool = True,
+    project_context: str = None,
+):
+    """
+    This function loads a project from a given remote source.
+
+    :param context: mlrun context.
+    :param url: remote url that represents the project's source.
+                See 'mlrun.load_project()' for details
+    :param project_name: project name
+    :param init_git: if True, will git init the context dir
+    :param subpath: project subpath (within the archive)
+    :param clone: if True, always clone (delete any existing content)
+    :param save: whether to save the created project and artifact in the DB
+    :param project_context: project context path (used for loading the project)
+    """
+    project = mlrun.load_project(
+        context=project_context or f"./{project_name}",
+        url=url,
+        name=project_name,
+        init_git=init_git,
+        subpath=subpath,
+        clone=clone,
+        save=save,
+        sync_functions=True,
+    )
+
+    context.logger.info(f"Loaded project {project.name} successfully")
```