mlrun 1.10.0rc11__py3-none-any.whl → 1.10.0rc13__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of mlrun might be problematic. Click here for more details.
- mlrun/__init__.py +2 -1
- mlrun/__main__.py +7 -1
- mlrun/artifacts/base.py +9 -3
- mlrun/artifacts/dataset.py +2 -1
- mlrun/artifacts/llm_prompt.py +6 -2
- mlrun/artifacts/model.py +2 -2
- mlrun/common/constants.py +1 -0
- mlrun/common/runtimes/constants.py +10 -1
- mlrun/common/schemas/__init__.py +1 -1
- mlrun/common/schemas/model_monitoring/model_endpoints.py +1 -1
- mlrun/common/schemas/serving.py +7 -0
- mlrun/config.py +21 -2
- mlrun/datastore/__init__.py +3 -1
- mlrun/datastore/alibaba_oss.py +1 -1
- mlrun/datastore/azure_blob.py +1 -1
- mlrun/datastore/base.py +6 -31
- mlrun/datastore/datastore.py +109 -33
- mlrun/datastore/datastore_profile.py +31 -0
- mlrun/datastore/dbfs_store.py +1 -1
- mlrun/datastore/google_cloud_storage.py +2 -2
- mlrun/datastore/model_provider/__init__.py +13 -0
- mlrun/datastore/model_provider/model_provider.py +160 -0
- mlrun/datastore/model_provider/openai_provider.py +144 -0
- mlrun/datastore/remote_client.py +65 -0
- mlrun/datastore/s3.py +1 -1
- mlrun/datastore/storeytargets.py +1 -1
- mlrun/datastore/utils.py +22 -0
- mlrun/datastore/v3io.py +1 -1
- mlrun/db/base.py +1 -1
- mlrun/db/httpdb.py +9 -4
- mlrun/db/nopdb.py +1 -1
- mlrun/execution.py +28 -7
- mlrun/launcher/base.py +23 -13
- mlrun/launcher/local.py +3 -1
- mlrun/launcher/remote.py +4 -2
- mlrun/model.py +65 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +175 -8
- mlrun/package/packagers_manager.py +2 -0
- mlrun/projects/operations.py +8 -1
- mlrun/projects/pipelines.py +40 -18
- mlrun/projects/project.py +28 -5
- mlrun/run.py +42 -2
- mlrun/runtimes/__init__.py +6 -0
- mlrun/runtimes/base.py +24 -6
- mlrun/runtimes/daskjob.py +1 -0
- mlrun/runtimes/databricks_job/databricks_runtime.py +1 -0
- mlrun/runtimes/local.py +1 -6
- mlrun/serving/server.py +1 -2
- mlrun/serving/states.py +438 -23
- mlrun/serving/system_steps.py +27 -29
- mlrun/utils/helpers.py +13 -2
- mlrun/utils/notifications/notification_pusher.py +15 -0
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/METADATA +2 -2
- {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/RECORD +59 -55
- {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/WHEEL +0 -0
- {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/entry_points.txt +0 -0
- {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/licenses/LICENSE +0 -0
- {mlrun-1.10.0rc11.dist-info → mlrun-1.10.0rc13.dist-info}/top_level.txt +0 -0
mlrun/execution.py
CHANGED
|
@@ -26,6 +26,7 @@ from dateutil import parser
|
|
|
26
26
|
import mlrun
|
|
27
27
|
import mlrun.common.constants as mlrun_constants
|
|
28
28
|
import mlrun.common.formatters
|
|
29
|
+
import mlrun.common.runtimes.constants
|
|
29
30
|
from mlrun.artifacts import (
|
|
30
31
|
Artifact,
|
|
31
32
|
DatasetArtifact,
|
|
@@ -91,6 +92,8 @@ class MLClientCtx:
|
|
|
91
92
|
self._autocommit = autocommit
|
|
92
93
|
self._notifications = []
|
|
93
94
|
self._state_thresholds = {}
|
|
95
|
+
self._retry_spec = {}
|
|
96
|
+
self._retry_count = None
|
|
94
97
|
|
|
95
98
|
self._labels = {}
|
|
96
99
|
self._annotations = {}
|
|
@@ -432,6 +435,7 @@ class MLClientCtx:
|
|
|
432
435
|
self._tolerations = spec.get("tolerations", self._tolerations)
|
|
433
436
|
self._affinity = spec.get("affinity", self._affinity)
|
|
434
437
|
self._reset_on_run = spec.get("reset_on_run", self._reset_on_run)
|
|
438
|
+
self._retry_spec = spec.get("retry", self._retry_spec)
|
|
435
439
|
|
|
436
440
|
self._init_dbs(rundb)
|
|
437
441
|
|
|
@@ -450,10 +454,11 @@ class MLClientCtx:
|
|
|
450
454
|
if start:
|
|
451
455
|
start = parser.parse(start) if isinstance(start, str) else start
|
|
452
456
|
self._start_time = start
|
|
453
|
-
self._state =
|
|
457
|
+
self._state = mlrun.common.runtimes.constants.RunStates.running
|
|
454
458
|
|
|
455
459
|
status = attrs.get("status")
|
|
456
|
-
|
|
460
|
+
retry_configured = self._retry_spec and self._retry_spec.get("count")
|
|
461
|
+
if (include_status or retry_configured) and status:
|
|
457
462
|
self._results = status.get("results", self._results)
|
|
458
463
|
for artifact in status.get("artifacts", []):
|
|
459
464
|
artifact_obj = dict_to_artifact(artifact)
|
|
@@ -462,7 +467,10 @@ class MLClientCtx:
|
|
|
462
467
|
)
|
|
463
468
|
for key, uri in status.get("artifact_uris", {}).items():
|
|
464
469
|
self._artifacts_manager.artifact_uris[key] = uri
|
|
465
|
-
self.
|
|
470
|
+
self._retry_count = status.get("retry_count", self._retry_count)
|
|
471
|
+
# if run is a retry, the state needs to move to running
|
|
472
|
+
if include_status:
|
|
473
|
+
self._state = status.get("state", self._state)
|
|
466
474
|
|
|
467
475
|
# No need to store the run for every worker
|
|
468
476
|
if store_run and self.is_logging_worker():
|
|
@@ -953,6 +961,11 @@ class MLClientCtx:
|
|
|
953
961
|
:returns: The logged `LLMPromptArtifact` object.
|
|
954
962
|
"""
|
|
955
963
|
|
|
964
|
+
if not prompt_string and not prompt_path:
|
|
965
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
966
|
+
"Either 'prompt_string' or 'prompt_path' must be provided"
|
|
967
|
+
)
|
|
968
|
+
|
|
956
969
|
llm_prompt = LLMPromptArtifact(
|
|
957
970
|
key=key,
|
|
958
971
|
project=self.project or "",
|
|
@@ -1107,13 +1120,13 @@ class MLClientCtx:
|
|
|
1107
1120
|
:param completed: Mark run as completed
|
|
1108
1121
|
"""
|
|
1109
1122
|
# Changing state to completed is allowed only when the execution is in running state
|
|
1110
|
-
if self._state !=
|
|
1123
|
+
if self._state != mlrun.common.runtimes.constants.RunStates.running:
|
|
1111
1124
|
completed = False
|
|
1112
1125
|
|
|
1113
1126
|
if message:
|
|
1114
1127
|
self._annotations["message"] = message
|
|
1115
1128
|
if completed:
|
|
1116
|
-
self._state =
|
|
1129
|
+
self._state = mlrun.common.runtimes.constants.RunStates.completed
|
|
1117
1130
|
|
|
1118
1131
|
if self._parent:
|
|
1119
1132
|
self._parent.update_child_iterations()
|
|
@@ -1147,9 +1160,15 @@ class MLClientCtx:
|
|
|
1147
1160
|
updates = {"status.last_update": now_date().isoformat()}
|
|
1148
1161
|
|
|
1149
1162
|
if error is not None:
|
|
1150
|
-
|
|
1163
|
+
state = mlrun.common.runtimes.constants.RunStates.error
|
|
1164
|
+
max_retries = self._retry_spec.get("count", 0)
|
|
1165
|
+
self._retry_count = self._retry_count or 0
|
|
1166
|
+
if max_retries and self._retry_count < max_retries:
|
|
1167
|
+
state = mlrun.common.runtimes.constants.RunStates.pending_retry
|
|
1168
|
+
|
|
1169
|
+
self._state = state
|
|
1151
1170
|
self._error = str(error)
|
|
1152
|
-
updates["status.state"] =
|
|
1171
|
+
updates["status.state"] = state
|
|
1153
1172
|
updates["status.error"] = error
|
|
1154
1173
|
elif (
|
|
1155
1174
|
execution_state
|
|
@@ -1241,11 +1260,13 @@ class MLClientCtx:
|
|
|
1241
1260
|
"node_selector": self._node_selector,
|
|
1242
1261
|
"tolerations": self._tolerations,
|
|
1243
1262
|
"affinity": self._affinity,
|
|
1263
|
+
"retry": self._retry_spec,
|
|
1244
1264
|
},
|
|
1245
1265
|
"status": {
|
|
1246
1266
|
"results": self._results,
|
|
1247
1267
|
"start_time": to_date_str(self._start_time),
|
|
1248
1268
|
"last_update": to_date_str(self._last_update),
|
|
1269
|
+
"retry_count": self._retry_count,
|
|
1249
1270
|
},
|
|
1250
1271
|
}
|
|
1251
1272
|
|
mlrun/launcher/base.py
CHANGED
|
@@ -18,6 +18,8 @@ import os
|
|
|
18
18
|
import uuid
|
|
19
19
|
from typing import Any, Callable, Optional, Union
|
|
20
20
|
|
|
21
|
+
import mlrun.common.constants
|
|
22
|
+
import mlrun.common.runtimes.constants
|
|
21
23
|
import mlrun.common.schemas
|
|
22
24
|
import mlrun.config
|
|
23
25
|
import mlrun.errors
|
|
@@ -72,6 +74,7 @@ class BaseLauncher(abc.ABC):
|
|
|
72
74
|
notifications: Optional[list[mlrun.model.Notification]] = None,
|
|
73
75
|
returns: Optional[list[Union[str, dict[str, str]]]] = None,
|
|
74
76
|
state_thresholds: Optional[dict[str, int]] = None,
|
|
77
|
+
retry: Optional[Union[mlrun.model.Retry, dict]] = None,
|
|
75
78
|
) -> "mlrun.run.RunObject":
|
|
76
79
|
"""run the function from the server/client[local/remote]"""
|
|
77
80
|
pass
|
|
@@ -133,7 +136,7 @@ class BaseLauncher(abc.ABC):
|
|
|
133
136
|
"""Check if the runtime requires to build the image and updates the spec accordingly"""
|
|
134
137
|
pass
|
|
135
138
|
|
|
136
|
-
def
|
|
139
|
+
def _validate_run(
|
|
137
140
|
self,
|
|
138
141
|
runtime: "mlrun.runtimes.BaseRuntime",
|
|
139
142
|
run: "mlrun.run.RunObject",
|
|
@@ -194,7 +197,7 @@ class BaseLauncher(abc.ABC):
|
|
|
194
197
|
)
|
|
195
198
|
|
|
196
199
|
@classmethod
|
|
197
|
-
def _validate_run_single_param(cls, param_name, param_value):
|
|
200
|
+
def _validate_run_single_param(cls, param_name: str, param_value: int):
|
|
198
201
|
# verify that integer parameters don't exceed a int64
|
|
199
202
|
if isinstance(param_value, int) and abs(param_value) >= 2**63:
|
|
200
203
|
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
@@ -203,8 +206,6 @@ class BaseLauncher(abc.ABC):
|
|
|
203
206
|
|
|
204
207
|
@staticmethod
|
|
205
208
|
def _create_run_object(task):
|
|
206
|
-
valid_task_types = (dict, mlrun.run.RunTemplate, mlrun.run.RunObject)
|
|
207
|
-
|
|
208
209
|
if not task:
|
|
209
210
|
# if task passed generate default RunObject
|
|
210
211
|
return mlrun.run.RunObject.from_dict(task)
|
|
@@ -215,18 +216,18 @@ class BaseLauncher(abc.ABC):
|
|
|
215
216
|
if isinstance(task, str):
|
|
216
217
|
task = ast.literal_eval(task)
|
|
217
218
|
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
|
|
221
|
-
|
|
222
|
-
|
|
219
|
+
valid_task_types = (dict, mlrun.run.RunTemplate, mlrun.run.RunObject)
|
|
220
|
+
if isinstance(task, mlrun.run.RunObject):
|
|
221
|
+
# if task is already a RunObject, we can return it as is
|
|
222
|
+
return task
|
|
223
223
|
if isinstance(task, mlrun.run.RunTemplate):
|
|
224
224
|
return mlrun.run.RunObject.from_template(task)
|
|
225
225
|
elif isinstance(task, dict):
|
|
226
226
|
return mlrun.run.RunObject.from_dict(task)
|
|
227
227
|
|
|
228
|
-
|
|
229
|
-
|
|
228
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
229
|
+
f"Task is not a valid object, type={type(task)}, expected types={valid_task_types}"
|
|
230
|
+
)
|
|
230
231
|
|
|
231
232
|
@staticmethod
|
|
232
233
|
def _enrich_run(
|
|
@@ -246,6 +247,7 @@ class BaseLauncher(abc.ABC):
|
|
|
246
247
|
workdir=None,
|
|
247
248
|
notifications: Optional[list[mlrun.model.Notification]] = None,
|
|
248
249
|
state_thresholds: Optional[dict[str, int]] = None,
|
|
250
|
+
retry: Optional[Union[mlrun.model.Retry, dict]] = None,
|
|
249
251
|
):
|
|
250
252
|
run.spec.handler = (
|
|
251
253
|
handler or run.spec.handler or runtime.spec.default_handler or ""
|
|
@@ -364,6 +366,7 @@ class BaseLauncher(abc.ABC):
|
|
|
364
366
|
| state_thresholds
|
|
365
367
|
)
|
|
366
368
|
run.spec.state_thresholds = state_thresholds or run.spec.state_thresholds
|
|
369
|
+
run.spec.retry = retry or run.spec.retry
|
|
367
370
|
return run
|
|
368
371
|
|
|
369
372
|
@staticmethod
|
|
@@ -410,7 +413,7 @@ class BaseLauncher(abc.ABC):
|
|
|
410
413
|
)
|
|
411
414
|
if (
|
|
412
415
|
run.status.state
|
|
413
|
-
in mlrun.common.runtimes.constants.RunStates.
|
|
416
|
+
in mlrun.common.runtimes.constants.RunStates.error_states()
|
|
414
417
|
):
|
|
415
418
|
if runtime._is_remote and not runtime.is_child:
|
|
416
419
|
logger.error(
|
|
@@ -418,7 +421,14 @@ class BaseLauncher(abc.ABC):
|
|
|
418
421
|
state=run.status.state,
|
|
419
422
|
status=run.status.to_dict(),
|
|
420
423
|
)
|
|
421
|
-
|
|
424
|
+
|
|
425
|
+
error = run.error
|
|
426
|
+
if (
|
|
427
|
+
run.status.state
|
|
428
|
+
== mlrun.common.runtimes.constants.RunStates.pending_retry
|
|
429
|
+
):
|
|
430
|
+
error = f"Run is pending retry, error: {run.error}"
|
|
431
|
+
raise mlrun.runtimes.utils.RunError(error)
|
|
422
432
|
return run
|
|
423
433
|
|
|
424
434
|
return None
|
mlrun/launcher/local.py
CHANGED
|
@@ -72,6 +72,7 @@ class ClientLocalLauncher(launcher.ClientBaseLauncher):
|
|
|
72
72
|
returns: Optional[list[Union[str, dict[str, str]]]] = None,
|
|
73
73
|
state_thresholds: Optional[dict[str, int]] = None,
|
|
74
74
|
reset_on_run: Optional[bool] = None,
|
|
75
|
+
retry: Optional[Union[mlrun.model.Retry, dict]] = None,
|
|
75
76
|
) -> "mlrun.run.RunObject":
|
|
76
77
|
# do not allow local function to be scheduled
|
|
77
78
|
if schedule is not None:
|
|
@@ -122,8 +123,9 @@ class ClientLocalLauncher(launcher.ClientBaseLauncher):
|
|
|
122
123
|
workdir=workdir,
|
|
123
124
|
notifications=notifications,
|
|
124
125
|
state_thresholds=state_thresholds,
|
|
126
|
+
retry=retry,
|
|
125
127
|
)
|
|
126
|
-
self.
|
|
128
|
+
self._validate_run(runtime, run)
|
|
127
129
|
result = self._execute(
|
|
128
130
|
runtime=runtime,
|
|
129
131
|
run=run,
|
mlrun/launcher/remote.py
CHANGED
|
@@ -61,6 +61,7 @@ class ClientRemoteLauncher(launcher.ClientBaseLauncher):
|
|
|
61
61
|
returns: Optional[list[Union[str, dict[str, str]]]] = None,
|
|
62
62
|
state_thresholds: Optional[dict[str, int]] = None,
|
|
63
63
|
reset_on_run: Optional[bool] = None,
|
|
64
|
+
retry: Optional[Union[mlrun.model.Retry, dict]] = None,
|
|
64
65
|
) -> "mlrun.run.RunObject":
|
|
65
66
|
self.enrich_runtime(runtime, project)
|
|
66
67
|
run = self._create_run_object(task)
|
|
@@ -82,8 +83,9 @@ class ClientRemoteLauncher(launcher.ClientBaseLauncher):
|
|
|
82
83
|
workdir=workdir,
|
|
83
84
|
notifications=notifications,
|
|
84
85
|
state_thresholds=state_thresholds,
|
|
86
|
+
retry=retry,
|
|
85
87
|
)
|
|
86
|
-
self.
|
|
88
|
+
self._validate_run(runtime, run)
|
|
87
89
|
|
|
88
90
|
if not runtime.is_deployed():
|
|
89
91
|
if runtime.spec.build.auto_build or auto_build:
|
|
@@ -190,7 +192,7 @@ class ClientRemoteLauncher(launcher.ClientBaseLauncher):
|
|
|
190
192
|
return self._wrap_run_result(runtime, resp, run, schedule=schedule)
|
|
191
193
|
|
|
192
194
|
@classmethod
|
|
193
|
-
def _validate_run_single_param(cls, param_name, param_value):
|
|
195
|
+
def _validate_run_single_param(cls, param_name: str, param_value: int):
|
|
194
196
|
if isinstance(param_value, pd.DataFrame):
|
|
195
197
|
raise mlrun.errors.MLRunInvalidArgumentTypeError(
|
|
196
198
|
f"Parameter '{param_name}' has an unsupported value of type"
|
mlrun/model.py
CHANGED
|
@@ -935,6 +935,41 @@ class HyperParamOptions(ModelObj):
|
|
|
935
935
|
)
|
|
936
936
|
|
|
937
937
|
|
|
938
|
+
class RetryBackoff(ModelObj):
|
|
939
|
+
"""Backoff strategy for retries."""
|
|
940
|
+
|
|
941
|
+
def __init__(self, base_delay: Optional[str] = None):
|
|
942
|
+
# The base_delay time string must conform to timelength python package standards and be at least
|
|
943
|
+
# mlrun.mlconf.function.spec.retry.backoff.min_base_delay (e.g. 1000s, 1 hour 30m, 1h etc.).
|
|
944
|
+
self.base_delay = (
|
|
945
|
+
base_delay or mlrun.mlconf.function.spec.retry.backoff.default_base_delay
|
|
946
|
+
)
|
|
947
|
+
|
|
948
|
+
|
|
949
|
+
class Retry(ModelObj):
|
|
950
|
+
"""Retry configuration"""
|
|
951
|
+
|
|
952
|
+
def __init__(
|
|
953
|
+
self,
|
|
954
|
+
count: int = 0,
|
|
955
|
+
backoff: typing.Union[RetryBackoff, dict] = None,
|
|
956
|
+
):
|
|
957
|
+
# Set to None if count is 0 to eliminate the retry configuration from the dictionary representation.
|
|
958
|
+
self.count = count or None
|
|
959
|
+
self.backoff = backoff
|
|
960
|
+
|
|
961
|
+
@property
|
|
962
|
+
def backoff(self) -> Optional[RetryBackoff]:
|
|
963
|
+
if not self.count:
|
|
964
|
+
# Retry is not configured, return None
|
|
965
|
+
return None
|
|
966
|
+
return self._backoff
|
|
967
|
+
|
|
968
|
+
@backoff.setter
|
|
969
|
+
def backoff(self, backoff):
|
|
970
|
+
self._backoff = self._verify_dict(backoff, "backoff", RetryBackoff)
|
|
971
|
+
|
|
972
|
+
|
|
938
973
|
class RunSpec(ModelObj):
|
|
939
974
|
"""Run specification"""
|
|
940
975
|
|
|
@@ -971,6 +1006,7 @@ class RunSpec(ModelObj):
|
|
|
971
1006
|
node_selector=None,
|
|
972
1007
|
tolerations=None,
|
|
973
1008
|
affinity=None,
|
|
1009
|
+
retry=None,
|
|
974
1010
|
):
|
|
975
1011
|
# A dictionary of parsing configurations that will be read from the inputs the user set. The keys are the inputs
|
|
976
1012
|
# keys (parameter names) and the values are the type hint given in the input keys after the colon.
|
|
@@ -1011,6 +1047,7 @@ class RunSpec(ModelObj):
|
|
|
1011
1047
|
self.node_selector = node_selector or {}
|
|
1012
1048
|
self.tolerations = tolerations or {}
|
|
1013
1049
|
self.affinity = affinity or {}
|
|
1050
|
+
self.retry = retry or {}
|
|
1014
1051
|
|
|
1015
1052
|
def _serialize_field(
|
|
1016
1053
|
self, struct: dict, field_name: Optional[str] = None, strip: bool = False
|
|
@@ -1212,6 +1249,14 @@ class RunSpec(ModelObj):
|
|
|
1212
1249
|
self._verify_dict(state_thresholds, "state_thresholds")
|
|
1213
1250
|
self._state_thresholds = state_thresholds
|
|
1214
1251
|
|
|
1252
|
+
@property
|
|
1253
|
+
def retry(self) -> Retry:
|
|
1254
|
+
return self._retry
|
|
1255
|
+
|
|
1256
|
+
@retry.setter
|
|
1257
|
+
def retry(self, retry: typing.Union[Retry, dict]):
|
|
1258
|
+
self._retry = self._verify_dict(retry, "retry", Retry)
|
|
1259
|
+
|
|
1215
1260
|
def extract_type_hints_from_inputs(self):
|
|
1216
1261
|
"""
|
|
1217
1262
|
This method extracts the type hints from the input keys in the input dictionary.
|
|
@@ -1329,6 +1374,7 @@ class RunStatus(ModelObj):
|
|
|
1329
1374
|
reason: Optional[str] = None,
|
|
1330
1375
|
notifications: Optional[dict[str, Notification]] = None,
|
|
1331
1376
|
artifact_uris: Optional[dict[str, str]] = None,
|
|
1377
|
+
retry_count: Optional[int] = None,
|
|
1332
1378
|
):
|
|
1333
1379
|
self.state = state or "created"
|
|
1334
1380
|
self.status_text = status_text
|
|
@@ -1346,6 +1392,7 @@ class RunStatus(ModelObj):
|
|
|
1346
1392
|
self.notifications = notifications or {}
|
|
1347
1393
|
# Artifact key -> URI mapping, since the full artifacts are not stored in the runs DB table
|
|
1348
1394
|
self._artifact_uris = artifact_uris or {}
|
|
1395
|
+
self._retry_count = retry_count or None
|
|
1349
1396
|
|
|
1350
1397
|
@classmethod
|
|
1351
1398
|
def from_dict(
|
|
@@ -1399,6 +1446,21 @@ class RunStatus(ModelObj):
|
|
|
1399
1446
|
|
|
1400
1447
|
self._artifact_uris = resolved_artifact_uris
|
|
1401
1448
|
|
|
1449
|
+
@property
|
|
1450
|
+
def retry_count(self) -> Optional[int]:
|
|
1451
|
+
"""
|
|
1452
|
+
The number of retries that were made for this run.
|
|
1453
|
+
"""
|
|
1454
|
+
return self._retry_count
|
|
1455
|
+
|
|
1456
|
+
@retry_count.setter
|
|
1457
|
+
def retry_count(self, retry_count: int):
|
|
1458
|
+
"""
|
|
1459
|
+
Set the number of retries that were made for this run.
|
|
1460
|
+
:param retry_count: The number of retries.
|
|
1461
|
+
"""
|
|
1462
|
+
self._retry_count = retry_count
|
|
1463
|
+
|
|
1402
1464
|
def is_failed(self) -> Optional[bool]:
|
|
1403
1465
|
"""
|
|
1404
1466
|
This method returns whether a run has failed.
|
|
@@ -2026,6 +2088,7 @@ def new_task(
|
|
|
2026
2088
|
secrets=None,
|
|
2027
2089
|
base=None,
|
|
2028
2090
|
returns=None,
|
|
2091
|
+
retry=None,
|
|
2029
2092
|
) -> RunTemplate:
|
|
2030
2093
|
"""Creates a new task
|
|
2031
2094
|
|
|
@@ -2061,6 +2124,7 @@ def new_task(
|
|
|
2061
2124
|
* A dictionary of configurations to use when logging. Further info per object type and
|
|
2062
2125
|
artifact type can be given there. The artifact key must appear in the dictionary as
|
|
2063
2126
|
"key": "the_key".
|
|
2127
|
+
:param retry: Retry configuration for the run, can be a dict or an instance of mlrun.model.Retry.
|
|
2064
2128
|
"""
|
|
2065
2129
|
|
|
2066
2130
|
if base:
|
|
@@ -2086,6 +2150,7 @@ def new_task(
|
|
|
2086
2150
|
run.spec.hyper_param_options.selector = (
|
|
2087
2151
|
selector or run.spec.hyper_param_options.selector
|
|
2088
2152
|
)
|
|
2153
|
+
run.spec.retry = retry or run.spec.retry
|
|
2089
2154
|
return run
|
|
2090
2155
|
|
|
2091
2156
|
|
|
@@ -804,25 +804,45 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
804
804
|
@staticmethod
|
|
805
805
|
def _get_sql_query(
|
|
806
806
|
*,
|
|
807
|
-
endpoint_id: str,
|
|
808
807
|
table_path: str,
|
|
808
|
+
endpoint_id: Optional[str] = None,
|
|
809
|
+
application_names: Optional[list[str]] = None,
|
|
809
810
|
name: str = mm_schemas.ResultData.RESULT_NAME,
|
|
810
811
|
metric_and_app_names: Optional[list[tuple[str, str]]] = None,
|
|
811
812
|
columns: Optional[list[str]] = None,
|
|
813
|
+
group_by_columns: Optional[list[str]] = None,
|
|
812
814
|
) -> str:
|
|
813
815
|
"""Get the SQL query for the results/metrics table"""
|
|
816
|
+
|
|
817
|
+
if metric_and_app_names and not endpoint_id:
|
|
818
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
819
|
+
"If metric_and_app_names is provided, endpoint_id must also be provided"
|
|
820
|
+
)
|
|
821
|
+
|
|
822
|
+
if metric_and_app_names and application_names:
|
|
823
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
824
|
+
"Cannot provide both metric_and_app_names and application_names"
|
|
825
|
+
)
|
|
826
|
+
|
|
814
827
|
if columns:
|
|
815
828
|
selection = ",".join(columns)
|
|
816
829
|
else:
|
|
817
830
|
selection = "*"
|
|
818
831
|
|
|
819
832
|
with StringIO() as query:
|
|
820
|
-
|
|
821
|
-
|
|
822
|
-
|
|
823
|
-
|
|
833
|
+
where_added = False
|
|
834
|
+
query.write(f"SELECT {selection} FROM '{table_path}'")
|
|
835
|
+
if endpoint_id:
|
|
836
|
+
query.write(
|
|
837
|
+
f" WHERE {mm_schemas.WriterEvent.ENDPOINT_ID}='{endpoint_id}'"
|
|
838
|
+
)
|
|
839
|
+
where_added = True
|
|
824
840
|
if metric_and_app_names:
|
|
825
|
-
|
|
841
|
+
if where_added:
|
|
842
|
+
query.write(" AND (")
|
|
843
|
+
else:
|
|
844
|
+
query.write(" WHERE (")
|
|
845
|
+
where_added = True
|
|
826
846
|
|
|
827
847
|
for i, (app_name, result_name) in enumerate(metric_and_app_names):
|
|
828
848
|
sub_cond = (
|
|
@@ -835,6 +855,22 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
835
855
|
|
|
836
856
|
query.write(")")
|
|
837
857
|
|
|
858
|
+
if application_names:
|
|
859
|
+
if where_added:
|
|
860
|
+
query.write(" AND (")
|
|
861
|
+
else:
|
|
862
|
+
query.write(" WHERE (")
|
|
863
|
+
for i, app_name in enumerate(application_names):
|
|
864
|
+
sub_cond = f"{mm_schemas.WriterEvent.APPLICATION_NAME}='{app_name}'"
|
|
865
|
+
if i != 0: # not first sub condition
|
|
866
|
+
query.write(" OR ")
|
|
867
|
+
query.write(sub_cond)
|
|
868
|
+
query.write(")")
|
|
869
|
+
|
|
870
|
+
if group_by_columns:
|
|
871
|
+
query.write(" GROUP BY ")
|
|
872
|
+
query.write(",".join(group_by_columns))
|
|
873
|
+
|
|
838
874
|
query.write(";")
|
|
839
875
|
return query.getvalue()
|
|
840
876
|
|
|
@@ -1272,7 +1308,49 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
1272
1308
|
end: Optional[Union[datetime, str]] = None,
|
|
1273
1309
|
application_names: Optional[Union[str, list[str]]] = None,
|
|
1274
1310
|
) -> dict[str, int]:
|
|
1275
|
-
|
|
1311
|
+
start, end = get_start_end(start=start, end=end, delta=timedelta(hours=24))
|
|
1312
|
+
group_by_columns = [
|
|
1313
|
+
mm_schemas.ApplicationEvent.APPLICATION_NAME,
|
|
1314
|
+
mm_schemas.ApplicationEvent.ENDPOINT_ID,
|
|
1315
|
+
]
|
|
1316
|
+
|
|
1317
|
+
def get_application_endpoints_records(
|
|
1318
|
+
record_type: Literal["metrics", "results"],
|
|
1319
|
+
):
|
|
1320
|
+
if record_type == "results":
|
|
1321
|
+
table_path = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
|
|
1322
|
+
else:
|
|
1323
|
+
table_path = self.tables[mm_schemas.V3IOTSDBTables.METRICS]
|
|
1324
|
+
sql_query = self._get_sql_query(
|
|
1325
|
+
table_path=table_path,
|
|
1326
|
+
columns=[mm_schemas.WriterEvent.START_INFER_TIME],
|
|
1327
|
+
group_by_columns=group_by_columns,
|
|
1328
|
+
application_names=application_names,
|
|
1329
|
+
)
|
|
1330
|
+
return self.frames_client.read(
|
|
1331
|
+
backend=_TSDB_BE,
|
|
1332
|
+
start=start,
|
|
1333
|
+
end=end,
|
|
1334
|
+
query=sql_query,
|
|
1335
|
+
)
|
|
1336
|
+
|
|
1337
|
+
df_results = get_application_endpoints_records("results")
|
|
1338
|
+
df_metrics = get_application_endpoints_records("metrics")
|
|
1339
|
+
|
|
1340
|
+
if df_results.empty and df_metrics.empty:
|
|
1341
|
+
return {}
|
|
1342
|
+
|
|
1343
|
+
# Combine the two dataframes and count unique endpoints per application
|
|
1344
|
+
combined_df = pd.concat([df_results, df_metrics], ignore_index=True)
|
|
1345
|
+
if combined_df.empty:
|
|
1346
|
+
return {}
|
|
1347
|
+
combined_df.drop_duplicates(subset=group_by_columns, inplace=True)
|
|
1348
|
+
|
|
1349
|
+
grouped_df = combined_df.groupby(
|
|
1350
|
+
mm_schemas.WriterEvent.APPLICATION_NAME
|
|
1351
|
+
).count()
|
|
1352
|
+
|
|
1353
|
+
return grouped_df[mm_schemas.WriterEvent.ENDPOINT_ID].to_dict()
|
|
1276
1354
|
|
|
1277
1355
|
def calculate_latest_metrics(
|
|
1278
1356
|
self,
|
|
@@ -1282,4 +1360,93 @@ class V3IOTSDBConnector(TSDBConnector):
|
|
|
1282
1360
|
) -> list[
|
|
1283
1361
|
Union[mm_schemas.ApplicationResultRecord, mm_schemas.ApplicationMetricRecord]
|
|
1284
1362
|
]:
|
|
1285
|
-
|
|
1363
|
+
metric_list = []
|
|
1364
|
+
start, end = get_start_end(start=start, end=end, delta=timedelta(hours=24))
|
|
1365
|
+
|
|
1366
|
+
# Get the latest results
|
|
1367
|
+
def get_latest_metrics_records(
|
|
1368
|
+
record_type: Literal["metrics", "results"],
|
|
1369
|
+
) -> pd.DataFrame:
|
|
1370
|
+
group_by_columns = [mm_schemas.ApplicationEvent.APPLICATION_NAME]
|
|
1371
|
+
if record_type == "results":
|
|
1372
|
+
table_path = self.tables[mm_schemas.V3IOTSDBTables.APP_RESULTS]
|
|
1373
|
+
columns = [
|
|
1374
|
+
f"last({mm_schemas.ResultData.RESULT_STATUS})",
|
|
1375
|
+
f"last({mm_schemas.ResultData.RESULT_VALUE})",
|
|
1376
|
+
f"last({mm_schemas.ResultData.RESULT_KIND})",
|
|
1377
|
+
]
|
|
1378
|
+
group_by_columns += [
|
|
1379
|
+
mm_schemas.ResultData.RESULT_NAME,
|
|
1380
|
+
]
|
|
1381
|
+
else:
|
|
1382
|
+
table_path = self.tables[mm_schemas.V3IOTSDBTables.METRICS]
|
|
1383
|
+
columns = [f"last({mm_schemas.MetricData.METRIC_VALUE})"]
|
|
1384
|
+
group_by_columns += [
|
|
1385
|
+
mm_schemas.MetricData.METRIC_NAME,
|
|
1386
|
+
]
|
|
1387
|
+
sql_query = self._get_sql_query(
|
|
1388
|
+
table_path=table_path,
|
|
1389
|
+
columns=columns,
|
|
1390
|
+
group_by_columns=group_by_columns,
|
|
1391
|
+
application_names=application_names,
|
|
1392
|
+
)
|
|
1393
|
+
|
|
1394
|
+
return self.frames_client.read(
|
|
1395
|
+
backend=_TSDB_BE,
|
|
1396
|
+
start=start,
|
|
1397
|
+
end=end,
|
|
1398
|
+
query=sql_query,
|
|
1399
|
+
)
|
|
1400
|
+
|
|
1401
|
+
df_results = get_latest_metrics_records("results")
|
|
1402
|
+
df_metrics = get_latest_metrics_records("metrics")
|
|
1403
|
+
|
|
1404
|
+
if df_results.empty and df_metrics.empty:
|
|
1405
|
+
return metric_list
|
|
1406
|
+
|
|
1407
|
+
# Convert the results DataFrame to a list of ApplicationResultRecord
|
|
1408
|
+
def build_metric_objects() -> (
|
|
1409
|
+
list[
|
|
1410
|
+
Union[
|
|
1411
|
+
mm_schemas.ApplicationResultRecord,
|
|
1412
|
+
mm_schemas.ApplicationMetricRecord,
|
|
1413
|
+
]
|
|
1414
|
+
]
|
|
1415
|
+
):
|
|
1416
|
+
metric_objects = []
|
|
1417
|
+
if not df_results.empty:
|
|
1418
|
+
df_results.rename(
|
|
1419
|
+
columns={
|
|
1420
|
+
f"last({mm_schemas.ResultData.RESULT_VALUE})": mm_schemas.ResultData.RESULT_VALUE,
|
|
1421
|
+
f"last({mm_schemas.ResultData.RESULT_STATUS})": mm_schemas.ResultData.RESULT_STATUS,
|
|
1422
|
+
f"last({mm_schemas.ResultData.RESULT_KIND})": mm_schemas.ResultData.RESULT_KIND,
|
|
1423
|
+
},
|
|
1424
|
+
inplace=True,
|
|
1425
|
+
)
|
|
1426
|
+
for _, row in df_results.iterrows():
|
|
1427
|
+
metric_objects.append(
|
|
1428
|
+
mm_schemas.ApplicationResultRecord(
|
|
1429
|
+
result_name=row[mm_schemas.ResultData.RESULT_NAME],
|
|
1430
|
+
kind=row[mm_schemas.ResultData.RESULT_KIND],
|
|
1431
|
+
status=row[mm_schemas.ResultData.RESULT_STATUS],
|
|
1432
|
+
value=row[mm_schemas.ResultData.RESULT_VALUE],
|
|
1433
|
+
)
|
|
1434
|
+
)
|
|
1435
|
+
if not df_metrics.empty:
|
|
1436
|
+
df_metrics.rename(
|
|
1437
|
+
columns={
|
|
1438
|
+
f"last({mm_schemas.MetricData.METRIC_VALUE})": mm_schemas.MetricData.METRIC_VALUE,
|
|
1439
|
+
},
|
|
1440
|
+
inplace=True,
|
|
1441
|
+
)
|
|
1442
|
+
|
|
1443
|
+
for _, row in df_metrics.iterrows():
|
|
1444
|
+
metric_objects.append(
|
|
1445
|
+
mm_schemas.ApplicationMetricRecord(
|
|
1446
|
+
metric_name=row[mm_schemas.MetricData.METRIC_NAME],
|
|
1447
|
+
value=row[mm_schemas.MetricData.METRIC_VALUE],
|
|
1448
|
+
)
|
|
1449
|
+
)
|
|
1450
|
+
return metric_objects
|
|
1451
|
+
|
|
1452
|
+
return build_metric_objects()
|
|
@@ -21,6 +21,7 @@ from typing import Any, Optional, Union
|
|
|
21
21
|
|
|
22
22
|
import mlrun.errors
|
|
23
23
|
from mlrun.artifacts import Artifact
|
|
24
|
+
from mlrun.artifacts.base import verify_target_path
|
|
24
25
|
from mlrun.datastore import DataItem, get_store_resource, store_manager
|
|
25
26
|
from mlrun.errors import MLRunInvalidArgumentError
|
|
26
27
|
from mlrun.utils import logger
|
|
@@ -276,6 +277,7 @@ class PackagersManager:
|
|
|
276
277
|
if data_item.get_artifact_type():
|
|
277
278
|
# Get the artifact object in the data item:
|
|
278
279
|
artifact, _ = store_manager.get_store_artifact(url=data_item.artifact_url)
|
|
280
|
+
verify_target_path(artifact)
|
|
279
281
|
# Get the key from the artifact's metadata and instructions from the artifact's spec:
|
|
280
282
|
artifact_key = artifact.metadata.key
|
|
281
283
|
packaging_instructions = artifact.spec.unpackaging_instructions
|
mlrun/projects/operations.py
CHANGED
|
@@ -20,7 +20,6 @@ import mlrun
|
|
|
20
20
|
import mlrun.common.constants as mlrun_constants
|
|
21
21
|
import mlrun.common.schemas.function
|
|
22
22
|
import mlrun.common.schemas.workflow
|
|
23
|
-
import mlrun_pipelines.common.models
|
|
24
23
|
import mlrun_pipelines.models
|
|
25
24
|
from mlrun.utils import hub_prefix
|
|
26
25
|
|
|
@@ -82,6 +81,7 @@ def run_function(
|
|
|
82
81
|
builder_env: Optional[list] = None,
|
|
83
82
|
reset_on_run: Optional[bool] = None,
|
|
84
83
|
output_path: Optional[str] = None,
|
|
84
|
+
retry: Optional[Union[mlrun.model.Retry, dict]] = None,
|
|
85
85
|
) -> Union[mlrun.model.RunObject, mlrun_pipelines.models.PipelineNodeWrapper]:
|
|
86
86
|
"""Run a local or remote task as part of a local/kubeflow pipeline
|
|
87
87
|
|
|
@@ -177,6 +177,7 @@ def run_function(
|
|
|
177
177
|
This ensures latest code changes are executed. This argument must be used in
|
|
178
178
|
conjunction with the local=True argument.
|
|
179
179
|
:param output_path: path to store artifacts, when running in a workflow this will be set automatically
|
|
180
|
+
:param retry: Retry configuration for the run, can be a dict or an instance of mlrun.model.Retry.
|
|
180
181
|
:return: MLRun RunObject or PipelineNodeWrapper
|
|
181
182
|
"""
|
|
182
183
|
if artifact_path:
|
|
@@ -197,6 +198,7 @@ def run_function(
|
|
|
197
198
|
returns=returns,
|
|
198
199
|
base=base_task,
|
|
199
200
|
selector=selector,
|
|
201
|
+
retry=retry,
|
|
200
202
|
)
|
|
201
203
|
task.spec.verbose = task.spec.verbose or verbose
|
|
202
204
|
|
|
@@ -205,6 +207,11 @@ def run_function(
|
|
|
205
207
|
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
206
208
|
"Scheduling jobs is not supported when running a workflow with the kfp engine."
|
|
207
209
|
)
|
|
210
|
+
if retry:
|
|
211
|
+
raise mlrun.errors.MLRunInvalidArgumentError(
|
|
212
|
+
"Retrying jobs is not supported when running a workflow with the kfp engine. "
|
|
213
|
+
"Use KFP set_retry instead."
|
|
214
|
+
)
|
|
208
215
|
return function.as_step(
|
|
209
216
|
name=name, runspec=task, workdir=workdir, outputs=outputs, labels=labels
|
|
210
217
|
)
|