mlrun 1.7.0rc17__py3-none-any.whl → 1.7.0rc18__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/alerts/alert.py +1 -1
- mlrun/artifacts/manager.py +5 -1
- mlrun/common/runtimes/constants.py +3 -0
- mlrun/common/schemas/__init__.py +1 -1
- mlrun/common/schemas/alert.py +31 -9
- mlrun/common/schemas/client_spec.py +1 -0
- mlrun/common/schemas/function.py +4 -0
- mlrun/common/schemas/model_monitoring/__init__.py +3 -1
- mlrun/common/schemas/model_monitoring/constants.py +20 -1
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +17 -6
- mlrun/config.py +2 -0
- mlrun/data_types/to_pandas.py +5 -5
- mlrun/datastore/datastore.py +6 -2
- mlrun/datastore/redis.py +2 -2
- mlrun/datastore/s3.py +5 -0
- mlrun/datastore/sources.py +111 -6
- mlrun/datastore/targets.py +2 -2
- mlrun/db/base.py +5 -1
- mlrun/db/httpdb.py +22 -3
- mlrun/db/nopdb.py +5 -1
- mlrun/errors.py +6 -0
- mlrun/feature_store/retrieval/conversion.py +5 -5
- mlrun/feature_store/retrieval/job.py +3 -2
- mlrun/feature_store/retrieval/spark_merger.py +2 -1
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +2 -2
- mlrun/model_monitoring/db/stores/base/store.py +16 -3
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +44 -43
- mlrun/model_monitoring/db/stores/v3io_kv/kv_store.py +190 -91
- mlrun/model_monitoring/db/tsdb/__init__.py +35 -6
- mlrun/model_monitoring/db/tsdb/base.py +25 -18
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +207 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +231 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +73 -72
- mlrun/model_monitoring/db/v3io_tsdb_reader.py +217 -16
- mlrun/model_monitoring/helpers.py +32 -0
- mlrun/model_monitoring/stream_processing.py +7 -4
- mlrun/model_monitoring/writer.py +18 -13
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/projects/project.py +33 -8
- mlrun/render.py +8 -5
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/utils/async_http.py +25 -5
- mlrun/utils/helpers.py +20 -1
- mlrun/utils/notifications/notification/slack.py +27 -7
- mlrun/utils/notifications/notification_pusher.py +38 -40
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/METADATA +7 -2
- {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/RECORD +55 -51
- {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/WHEEL +0 -0
- {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc17.dist-info → mlrun-1.7.0rc18.dist-info}/top_level.txt +0 -0
mlrun/model_monitoring/helpers.py
CHANGED

@@ -111,6 +111,24 @@ def get_connection_string(secret_provider: typing.Callable = None) -> str:
     )
 
 
+def get_tsdb_connection_string(
+    secret_provider: typing.Optional[typing.Callable] = None,
+) -> str:
+    """Get TSDB connection string from the project secret. If wasn't set, take it from the system
+    configurations.
+    :param secret_provider: An optional secret provider to get the connection string secret.
+    :return: Valid TSDB connection string.
+    """
+
+    return (
+        mlrun.get_secret_or_env(
+            key=mlrun.common.schemas.model_monitoring.ProjectSecretKeys.TSDB_CONNECTION,
+            secret_provider=secret_provider,
+        )
+        or mlrun.mlconf.model_endpoint_monitoring.tsdb_connection
+    )
+
+
 def batch_dict2timedelta(batch_dict: _BatchDict) -> datetime.timedelta:
     """
     Convert a batch dictionary to timedelta.
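The helper resolves the connection string from the project secret first and falls back to the system configuration. A minimal usage sketch (the provider callable, the URL, and the literal secret key "TSDB_CONNECTION" are illustrative assumptions, not part of the diff):

import mlrun
from mlrun.model_monitoring.helpers import get_tsdb_connection_string

# Hypothetical provider: any callable mapping a secret key to its value (or None).
def my_secret_provider(key: str):
    # assumes ProjectSecretKeys.TSDB_CONNECTION resolves to "TSDB_CONNECTION"
    return {"TSDB_CONNECTION": "taosws://user:password@tsdb-host:6041"}.get(key)

# Falls back to mlrun.mlconf.model_endpoint_monitoring.tsdb_connection when the secret is unset.
connection_string = get_tsdb_connection_string(secret_provider=my_secret_provider)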
@@ -260,3 +278,17 @@ def get_endpoint_record(project: str, endpoint_id: str):
         project=project,
     )
     return model_endpoint_store.get_model_endpoint(endpoint_id=endpoint_id)
+
+
+def get_result_instance_fqn(
+    model_endpoint_id: str, app_name: str, result_name: str
+) -> str:
+    return f"{model_endpoint_id}.{app_name}.result.{result_name}"
+
+
+def get_default_result_instance_fqn(model_endpoint_id: str) -> str:
+    return get_result_instance_fqn(
+        model_endpoint_id,
+        mm_constants.HistogramDataDriftApplicationConstants.NAME,
+        mm_constants.HistogramDataDriftApplicationConstants.GENERAL_RESULT_NAME,
+    )
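For reference, the FQN these helpers produce has the shape "<endpoint-id>.<app-name>.result.<result-name>"; the writer below uses it as the alert entity id. With illustrative values:

from mlrun.model_monitoring.helpers import get_result_instance_fqn

fqn = get_result_instance_fqn("ep-123", "my-app", "drift-metric")
assert fqn == "ep-123.my-app.result.drift-metric"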
mlrun/model_monitoring/stream_processing.py
CHANGED

@@ -136,7 +136,11 @@ class EventStreamProcessor:
         self.tsdb_batching_max_events = tsdb_batching_max_events
         self.tsdb_batching_timeout_secs = tsdb_batching_timeout_secs
 
-    def apply_monitoring_serving_graph(self, fn: mlrun.runtimes.ServingRuntime) -> None:
+    def apply_monitoring_serving_graph(
+        self,
+        fn: mlrun.runtimes.ServingRuntime,
+        tsdb_service_provider: typing.Optional[typing.Callable] = None,
+    ) -> None:
         """
         Apply monitoring serving graph to a given serving function. The following serving graph includes about 4 main
         parts that each one them includes several steps of different operations that are executed on the events from
@@ -163,6 +167,7 @@ class EventStreamProcessor:
                       using CE, the parquet target path is based on the defined MLRun artifact path.
 
         :param fn: A serving function.
+        :param tsdb_service_provider: An optional callable function that provides the TSDB connection string.
         """
 
         graph = typing.cast(
@@ -322,15 +327,13 @@ class EventStreamProcessor:
 
         # TSDB branch (skip to Prometheus if in CE env)
         if not mlrun.mlconf.is_ce_mode():
-            # TSDB branch
             tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
-                project=self.project,
+                project=self.project, secret_provider=tsdb_service_provider
             )
             tsdb_connector.apply_monitoring_stream_steps(graph=graph)
 
         else:
             # Prometheus
-
             # Increase the prediction counter by 1 and update the latency value
             graph.add_step(
                 "IncCounter",
mlrun/model_monitoring/writer.py
CHANGED
@@ -29,7 +29,7 @@ from mlrun.common.schemas.model_monitoring.constants import (
     WriterEventKind,
 )
 from mlrun.common.schemas.notification import NotificationKind, NotificationSeverity
-from mlrun.model_monitoring.helpers import get_endpoint_record
+from mlrun.model_monitoring.helpers import get_endpoint_record, get_result_instance_fqn
 from mlrun.serving.utils import StepToDict
 from mlrun.utils import logger
 from mlrun.utils.notifications.notification_pusher import CustomNotificationPusher
@@ -101,7 +101,7 @@ class ModelMonitoringWriter(StepToDict):
 
     kind = "monitoring_application_stream_pusher"
 
-    def __init__(self, project: str) -> None:
+    def __init__(self, project: str, tsdb_secret_provider=None) -> None:
         self.project = project
         self.name = project  # required for the deployment process
 
@@ -113,24 +113,24 @@ class ModelMonitoringWriter(StepToDict):
             project=self.project
         )
         self._tsdb_connector = mlrun.model_monitoring.get_tsdb_connector(
-            project=self.project,
+            project=self.project, secret_provider=tsdb_secret_provider
         )
         self._endpoints_records = {}
 
     @staticmethod
     def _generate_event_on_drift(
-
+        entity_id: str, drift_status: str, event_value: dict, project_name: str
     ) -> None:
-        logger.info("Sending an
+        logger.info("Sending an event")
         entity = mlrun.common.schemas.alert.EventEntities(
-            kind=alert_objects.EventEntityKind.
+            kind=alert_objects.EventEntityKind.MODEL_ENDPOINT_RESULT,
             project=project_name,
-            ids=[
+            ids=[entity_id],
         )
         event_kind = (
-            alert_objects.EventKind.
+            alert_objects.EventKind.DATA_DRIFT_DETECTED
             if drift_status == ResultStatusApp.detected.value
-            else alert_objects.EventKind.
+            else alert_objects.EventKind.DATA_DRIFT_SUSPECTED
         )
         event_data = mlrun.common.schemas.Event(
             kind=event_kind, entity=entity, value_dict=event_value
@@ -138,7 +138,7 @@ class ModelMonitoringWriter(StepToDict):
         mlrun.get_run_db().generate_event(event_kind, event_data)
 
     @staticmethod
-    def _reconstruct_event(event: _RawEvent) -> tuple[_AppResultEvent,
+    def _reconstruct_event(event: _RawEvent) -> tuple[_AppResultEvent, WriterEventKind]:
         """
         Modify the raw event into the expected monitoring application event
         schema as defined in `mlrun.common.schemas.model_monitoring.constants.WriterEvent`
@@ -179,12 +179,13 @@ class ModelMonitoringWriter(StepToDict):
     def do(self, event: _RawEvent) -> None:
         event, kind = self._reconstruct_event(event)
         logger.info("Starting to write event", event=event)
-
         self._tsdb_connector.write_application_event(event=event.copy(), kind=kind)
         self._app_result_store.write_application_event(event=event.copy(), kind=kind)
+
         logger.info("Completed event DB writes")
 
-
+        if kind == WriterEventKind.RESULT:
+            _Notifier(event=event, notification_pusher=self._custom_notifier).notify()
 
         if (
             mlrun.mlconf.alerts.mode == mlrun.common.schemas.alert.AlertsModes.enabled
@@ -208,7 +209,11 @@ class ModelMonitoringWriter(StepToDict):
                 "result_value": event[ResultData.RESULT_VALUE],
             }
             self._generate_event_on_drift(
-
+                get_result_instance_fqn(
+                    event[WriterEvent.ENDPOINT_ID],
+                    event[WriterEvent.APPLICATION_NAME],
+                    event[ResultData.RESULT_NAME],
+                ),
                 event[ResultData.RESULT_STATUS],
                 event_value,
                 self.project,
mlrun/package/utils/_formatter.py
CHANGED

@@ -142,11 +142,11 @@ class _YAMLFormatter(_Formatter):
 
         :param obj:         The object to write.
         :param file_path:   The file path to write to.
-        :param dump_kwargs: Additional keyword arguments to pass to the `yaml.
+        :param dump_kwargs: Additional keyword arguments to pass to the `yaml.safe_dump` method of the formatter in use.
         """
         dump_kwargs = dump_kwargs or cls.DEFAULT_DUMP_KWARGS
         with open(file_path, "w") as file:
-            yaml.
+            yaml.safe_dump(obj, file, **dump_kwargs)
 
     @classmethod
     def read(cls, file_path: str) -> Union[list, dict]:
mlrun/projects/project.py
CHANGED
@@ -39,6 +39,7 @@ import yaml
 from mlrun_pipelines.models import PipelineNodeWrapper
 
 import mlrun.common.helpers
+import mlrun.common.runtimes.constants
 import mlrun.common.schemas.artifact
 import mlrun.common.schemas.model_monitoring.constants as mm_constants
 import mlrun.db
@@ -3098,17 +3099,18 @@ class MlrunProject(ModelObj):
 
     def set_model_monitoring_credentials(
         self,
-        access_key: str = None,
-        endpoint_store_connection: str = None,
-        stream_path: str = None,
+        access_key: Optional[str] = None,
+        endpoint_store_connection: Optional[str] = None,
+        stream_path: Optional[str] = None,
+        tsdb_connection: Optional[str] = None,
     ):
         """Set the credentials that will be used by the project's model monitoring
         infrastructure functions.
 
-        :param access_key: Model Monitoring access key for managing user permissions
         :param access_key: Model Monitoring access key for managing user permissions
         :param endpoint_store_connection: Endpoint store connection string
         :param stream_path: Path to the model monitoring stream
+        :param tsdb_connection: Connection string to the time series database
         """
 
         secrets_dict = {}
@@ -3131,6 +3133,16 @@ class MlrunProject(ModelObj):
             mlrun.common.schemas.model_monitoring.ProjectSecretKeys.STREAM_PATH
         ] = stream_path
 
+        if tsdb_connection:
+            if not tsdb_connection.startswith("taosws://"):
+                raise mlrun.errors.MLRunInvalidArgumentError(
+                    "Currently only TDEngine websocket connection is supported for non-v3io TSDB,"
+                    "please provide a full URL (e.g. taosws://user:password@host:port)"
+                )
+            secrets_dict[
+                mlrun.common.schemas.model_monitoring.ProjectSecretKeys.TSDB_CONNECTION
+            ] = tsdb_connection
+
         self.set_secrets(
             secrets=secrets_dict,
             provider=mlrun.common.schemas.SecretProviderName.kubernetes,
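A usage sketch for the new parameter (project name, credentials, host, and port are placeholders; per the validation above, only taosws:// URLs are accepted for a non-v3io TSDB):

import mlrun

project = mlrun.get_or_create_project("my-project", context="./")
project.set_model_monitoring_credentials(
    tsdb_connection="taosws://user:password@tsdb-host:6041",
)
# A non-websocket URL such as "http://tsdb-host:6041" raises MLRunInvalidArgumentError.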
@@ -3689,7 +3701,10 @@ class MlrunProject(ModelObj):
         name: Optional[str] = None,
         uid: Optional[Union[str, list[str]]] = None,
         labels: Optional[Union[str, list[str]]] = None,
-        state: Optional[
+        state: Optional[
+            mlrun.common.runtimes.constants.RunStates
+        ] = None,  # Backward compatibility
+        states: typing.Optional[list[mlrun.common.runtimes.constants.RunStates]] = None,
         sort: bool = True,
         last: int = 0,
         iter: bool = False,
@@ -3723,10 +3738,11 @@ class MlrunProject(ModelObj):
         :param labels: A list of labels to filter by. Label filters work by either filtering a specific value
                        of a label (i.e. list("key=value")) or by looking for the existence of a given
                        key (i.e. "key").
-        :param state: List only runs whose state is specified.
+        :param state: Deprecated - List only runs whose state is specified.
+        :param states: List only runs whose state is one of the provided states.
         :param sort: Whether to sort the result according to their start time. Otherwise, results will be
                      returned by their internal order in the DB (order will not be guaranteed).
-        :param last: Deprecated - currently not used (will be removed in 1.
+        :param last: Deprecated - currently not used (will be removed in 1.9.0).
         :param iter: If ``True`` return runs from all iterations. Otherwise, return only runs whose ``iter`` is 0.
         :param start_time_from: Filter by run start time in ``[start_time_from, start_time_to]``.
         :param start_time_to: Filter by run start time in ``[start_time_from, start_time_to]``.
@@ -3734,13 +3750,22 @@ class MlrunProject(ModelObj):
                                      last_update_time_to)``.
         :param last_update_time_to: Filter by run last update time in ``(last_update_time_from, last_update_time_to)``.
         """
+        if state:
+            # TODO: Remove this in 1.9.0
+            warnings.warn(
+                "'state' is deprecated and will be removed in 1.9.0. Use 'states' instead.",
+                FutureWarning,
+            )
+
         db = mlrun.db.get_run_db(secrets=self._secrets)
         return db.list_runs(
             name,
             uid,
             self.metadata.name,
             labels=labels,
-
+            states=mlrun.utils.helpers.as_list(state)
+            if state is not None
+            else states or None,
             sort=sort,
             last=last,
             iter=iter,
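A sketch of the old and new call styles (the state values are illustrative):

# New style: filter on several run states at once.
runs = project.list_runs(states=["completed", "error"])

# Old style still works — it is wrapped into a one-element list via
# mlrun.utils.helpers.as_list — but now emits a FutureWarning.
runs = project.list_runs(state="completed")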
mlrun/render.py
CHANGED
@@ -126,7 +126,7 @@ def artifacts_html(
 
         if not attribute_value:
             mlrun.utils.logger.warning(
-                "Artifact is
+                f"Artifact required attribute {attribute_name} is missing, omitting from output",
                 artifact_key=key,
             )
             continue
@@ -400,14 +400,17 @@ def runs_to_html(
     else:
         df["labels"] = df["labels"].apply(dict_html)
         df["inputs"] = df["inputs"].apply(inputs_html)
-        if df["
-            df["artifact_uris"] = df["artifact_uris"].apply(dict_html)
-            df.drop("artifacts", axis=1, inplace=True)
-        else:
+        if df["artifacts"][0]:
             df["artifacts"] = df["artifacts"].apply(
                 lambda artifacts: artifacts_html(artifacts, "target_path"),
             )
             df.drop("artifact_uris", axis=1, inplace=True)
+        elif df["artifact_uris"][0]:
+            df["artifact_uris"] = df["artifact_uris"].apply(dict_html)
+            df.drop("artifacts", axis=1, inplace=True)
+        else:
+            df.drop("artifacts", axis=1, inplace=True)
+            df.drop("artifact_uris", axis=1, inplace=True)
 
     def expand_error(x):
         if x["state"] == "error":
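The new branch order prefers a populated artifacts column, falls back to artifact_uris, and drops both columns when neither has data in the first row. A toy sketch of the middle case (not mlrun's real run records):

import pandas as pd

df = pd.DataFrame({"artifacts": [[]], "artifact_uris": [{"model": "store://models/demo"}]})
# df["artifacts"][0] is empty -> the elif renders artifact_uris and drops "artifacts";
# if both were empty, the else branch would drop both columns.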
mlrun/runtimes/databricks_job/databricks_wrapper.py
CHANGED

@@ -99,7 +99,7 @@ def save_credentials(
     credentials["DATABRICKS_CLUSTER_ID"] = cluster_id
 
     with open(credentials_path, "w") as yaml_file:
-        yaml.
+        yaml.safe_dump(credentials, yaml_file, default_flow_style=False)
 
 
 def run_mlrun_databricks_job(
mlrun/utils/async_http.py
CHANGED
@@ -24,7 +24,7 @@ from aiohttp_retry import ExponentialRetry, RequestParams, RetryClient, RetryOpt
 from aiohttp_retry.client import _RequestContext
 
 from mlrun.config import config
-from mlrun.errors import err_to_str
+from mlrun.errors import err_to_str, raise_for_status
 
 from .helpers import logger as mlrun_logger
 
@@ -46,12 +46,21 @@ class AsyncClientWithRetry(RetryClient):
         *args,
         **kwargs,
     ):
+        # do not retry on PUT / PATCH as they might have side effects (not truly idempotent)
+        blacklisted_methods = (
+            blacklisted_methods
+            if blacklisted_methods is not None
+            else [
+                "POST",
+                "PUT",
+                "PATCH",
+            ]
+        )
         super().__init__(
             *args,
             retry_options=ExponentialRetryOverride(
                 retry_on_exception=retry_on_exception,
-
-                blacklisted_methods=blacklisted_methods or ["POST", "PUT", "PATCH"],
+                blacklisted_methods=blacklisted_methods,
                 attempts=max_retries,
                 statuses=retry_on_status_codes,
                 factor=retry_backoff_factor,
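Moving the default out of the `or` expression means an explicitly passed empty list is now honored instead of silently reverting to the default. A hedged sketch of the difference (other constructor arguments keep their defaults):

# Retry all methods, including non-idempotent ones — only sensible when the
# target endpoints are known to be idempotent.
client = AsyncClientWithRetry(blacklisted_methods=[])
# Previously, `[] or ["POST", "PUT", "PATCH"]` evaluated to the default, so an
# empty blacklist was impossible to express.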
@@ -63,6 +72,12 @@ class AsyncClientWithRetry(RetryClient):
             **kwargs,
         )
 
+    def methods_blacklist_update_required(self, new_blacklist: str):
+        self._retry_options: ExponentialRetryOverride
+        return set(self._retry_options.blacklisted_methods).difference(
+            set(new_blacklist)
+        )
+
     def _make_requests(
         self,
         params_list: list[RequestParams],
@@ -173,7 +188,7 @@ class _CustomRequestContext(_RequestContext):
                 last_attempt = current_attempt == self._retry_options.attempts
                 if self._is_status_code_ok(response.status) or last_attempt:
                     if self._raise_for_status:
-
+                        raise_for_status(response)
 
                     self._response = response
                     return response
@@ -275,6 +290,11 @@ class _CustomRequestContext(_RequestContext):
                 if isinstance(exc.os_error, exc_type):
                     return
             if exc.__cause__:
-                return
+                # If the cause exception is retriable, return, otherwise, raise the original exception
+                try:
+                    self.verify_exception_type(exc.__cause__)
+                except Exception:
+                    raise exc
+                return
         else:
             raise exc
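The retry check now recurses into exc.__cause__ rather than returning unconditionally. A standalone sketch of the rule (the retriable types are illustrative, not mlrun's actual list):

def is_retriable(exc: BaseException) -> bool:
    # An exception is retriable if it, or anything in its __cause__ chain,
    # is of a retriable type; otherwise the original exception propagates.
    if isinstance(exc, (ConnectionError, TimeoutError)):
        return True
    if exc.__cause__ is not None:
        return is_retriable(exc.__cause__)
    return False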
mlrun/utils/helpers.py
CHANGED
@@ -973,6 +973,15 @@ def get_ui_url(project, uid=None):
     return url
 
 
+def get_model_endpoint_url(project, model_name, model_endpoint_id):
+    url = ""
+    if mlrun.mlconf.resolve_ui_url():
+        url = f"{mlrun.mlconf.resolve_ui_url()}/{mlrun.mlconf.ui.projects_prefix}/{project}/models"
+        if model_name:
+            url += f"/model-endpoints/{model_name}/{model_endpoint_id}/overview"
+    return url
+
+
 def get_workflow_url(project, id=None):
     url = ""
     if mlrun.mlconf.resolve_ui_url():
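Assuming the UI URL resolves and the default projects prefix, the helper yields URLs of this shape (host and ids are illustrative):

url = get_model_endpoint_url("my-project", "churn-model", "ep-123")
# e.g. "https://mlrun-ui.example.com/projects/my-project/models/model-endpoints/churn-model/ep-123/overview"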
@@ -1183,7 +1192,7 @@ def calculate_dataframe_hash(dataframe: pandas.DataFrame):
     return hashlib.sha1(pandas.util.hash_pandas_object(dataframe).values).hexdigest()
 
 
-def template_artifact_path(artifact_path, project, run_uid="project"):
+def template_artifact_path(artifact_path, project, run_uid=None):
     """
     Replace {{run.uid}} with the run uid and {{project}} with the project name in the artifact path.
     If no run uid is provided, the word `project` will be used instead as it is assumed to be a project
@@ -1191,6 +1200,7 @@ def template_artifact_path(artifact_path, project, run_uid="project"):
     """
     if not artifact_path:
         return artifact_path
+    run_uid = run_uid or "project"
     artifact_path = artifact_path.replace("{{run.uid}}", run_uid)
     artifact_path = _fill_project_path_template(artifact_path, project)
     return artifact_path
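Caller-visible behavior is unchanged; the default is just resolved inside the body now. For example:

template_artifact_path("s3://bucket/{{project}}/{{run.uid}}", "proj")
# -> "s3://bucket/proj/project"  (no run uid: the literal word "project" is substituted)
template_artifact_path("s3://bucket/{{project}}/{{run.uid}}", "proj", run_uid="abc123")
# -> "s3://bucket/proj/abc123"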
@@ -1603,3 +1613,12 @@ def validate_component_version_compatibility(
     if parsed_current_version < parsed_min_version:
         return False
     return True
+
+
+def format_alert_summary(
+    alert: mlrun.common.schemas.AlertConfig, event_data: mlrun.common.schemas.Event
+) -> str:
+    result = alert.summary.replace("{{project}}", alert.project)
+    result = result.replace("{{name}}", alert.name)
+    result = result.replace("{{entity}}", event_data.entity.ids[0])
+    return result
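A self-contained sketch with duck-typed stand-ins for the schema objects (the summary template is illustrative):

from types import SimpleNamespace

alert = SimpleNamespace(
    summary="{{name}} fired for {{entity}} in {{project}}",
    project="my-project",
    name="drift-alert",
)
event = SimpleNamespace(entity=SimpleNamespace(ids=["ep-123.my-app.result.drift-metric"]))
print(format_alert_summary(alert, event))
# -> "drift-alert fired for ep-123.my-app.result.drift-metric in my-project"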
mlrun/utils/notifications/notification/slack.py
CHANGED

@@ -32,6 +32,7 @@ class SlackNotification(NotificationBase):
         "completed": ":smiley:",
         "running": ":man-running:",
         "error": ":x:",
+        "skipped": ":zzz:",
     }
 
     async def push(
@@ -135,8 +136,16 @@ class SlackNotification(NotificationBase):
         line = [
             self._get_slack_row(f":bell: {alert.name} alert has occurred"),
             self._get_slack_row(f"*Project:*\n{alert.project}"),
-            self._get_slack_row(f"*
+            self._get_slack_row(f"*ID:*\n{event_data.entity.ids[0]}"),
         ]
+
+        if alert.summary:
+            line.append(
+                self._get_slack_row(
+                    f"*Summary:*\n{mlrun.utils.helpers.format_alert_summary(alert, event_data)}"
+                )
+            )
+
         if event_data.value_dict:
             data_lines = []
             for key, value in event_data.value_dict.items():
@@ -144,10 +153,21 @@ class SlackNotification(NotificationBase):
             data_text = "\n".join(data_lines)
             line.append(self._get_slack_row(f"*Event data:*\n{data_text}"))
 
-        if
-
-        ):
-
+        if (
+            event_data.entity.kind == mlrun.common.schemas.alert.EventEntityKind.JOB
+        ):  # JOB entity
+            uid = event_data.value_dict.get("uid")
+            url = mlrun.utils.helpers.get_ui_url(alert.project, uid)
+            overview_type = "Job overview"
+        else:  # MODEL entity
+            model_name = event_data.value_dict.get("model")
+            model_endpoint_id = event_data.value_dict.get("model_endpoint_id")
+            url = mlrun.utils.helpers.get_model_endpoint_url(
+                alert.project, model_name, model_endpoint_id
+            )
+            overview_type = "Model endpoint"
+
+        line.append(self._get_slack_row(f"*Overview:*\n<{url}|*{overview_type}*>"))
 
         return line
 
@@ -157,11 +177,11 @@ class SlackNotification(NotificationBase):
 
         # Only show the URL if the run is not a function (serving or mlrun function)
         kind = run.get("step_kind")
-
+        state = run["status"].get("state", "")
+        if state != "skipped" and (url and not kind or kind == "run"):
             line = f'<{url}|*{meta.get("name")}*>'
         else:
             line = meta.get("name")
-        state = run["status"].get("state", "")
         if kind:
             line = f'{line} *({run.get("step_kind", run.get("kind", ""))})*'
         line = f'{self.emojis.get(state, ":question:")} {line}'
mlrun/utils/notifications/notification_pusher.py
CHANGED

@@ -14,7 +14,6 @@
 
 import asyncio
 import datetime
-import json
 import os
 import re
 import traceback
@@ -23,6 +22,7 @@ from concurrent.futures import ThreadPoolExecutor
 
 import kfp
 import mlrun_pipelines.common.ops
+import mlrun_pipelines.models
 
 import mlrun.common.runtimes.constants
 import mlrun.common.schemas
@@ -392,17 +392,29 @@ class NotificationPusher(_NotificationPusherBase):
         steps = []
         db = mlrun.get_run_db()
 
-        def _add_run_step(
-
-
-
-
-
+        def _add_run_step(_step: mlrun_pipelines.models.PipelineStep):
+            try:
+                _run = db.list_runs(
+                    project=run.metadata.project,
+                    labels=f"mlrun/runner-pod={_step.node_name}",
+                )[0]
+            except IndexError:
+                _run = {
+                    "metadata": {
+                        "name": _step.display_name,
+                        "project": run.metadata.project,
+                    },
+                }
+            _run["step_kind"] = _step.step_type
+            if _step.skipped:
+                _run.setdefault("status", {})["state"] = (
+                    mlrun.common.runtimes.constants.RunStates.skipped
+                )
             steps.append(_run)
 
-        def _add_deploy_function_step(
+        def _add_deploy_function_step(_step: mlrun_pipelines.models.PipelineStep):
             project, name, hash_key = self._extract_function_uri(
-
+                _step.get_annotation("mlrun/function-uri")
             )
             if name:
                 try:
@@ -419,16 +431,19 @@ class NotificationPusher(_NotificationPusherBase):
                     "hash_key": hash_key,
                 },
             }
-
-
-
-
-
+            pod_phase = _step.phase
+            if _step.skipped:
+                state = mlrun.common.schemas.FunctionState.skipped
+            else:
+                state = mlrun.common.runtimes.constants.PodPhases.pod_phase_to_run_state(
+                    pod_phase
+                )
+            function["status"] = {"state": state}
             if isinstance(function["metadata"].get("updated"), datetime.datetime):
                 function["metadata"]["updated"] = function["metadata"][
                     "updated"
                 ].isoformat()
-            function["step_kind"] =
+            function["step_kind"] = _step.step_type
             steps.append(function)
 
         step_methods = {
@@ -446,26 +461,10 @@ class NotificationPusher(_NotificationPusherBase):
             return steps
 
         try:
-
-
-                key=lambda _node: _node[1]["finishedAt"],
-            )
-            for node_name, node in workflow_nodes:
-                if node["type"] != "Pod":
-                    # Skip the parent DAG node
-                    continue
-
-                node_template = next(
-                    template
-                    for template in workflow_manifest["spec"]["templates"]
-                    if template["name"] == node["templateName"]
-                )
-                step_type = node_template["metadata"]["annotations"].get(
-                    "mlrun/pipeline-step-type"
-                )
-                step_method = step_methods.get(step_type)
+            for step in workflow_manifest.get_steps():
+                step_method = step_methods.get(step.step_type)
                 if step_method:
-                    step_method(
+                    step_method(step)
             return steps
         except Exception:
             # If we fail to read the pipeline steps, we will return the list of runs that have the same workflow id
@@ -481,7 +480,9 @@ class NotificationPusher(_NotificationPusherBase):
         )
 
     @staticmethod
-    def _get_workflow_manifest(
+    def _get_workflow_manifest(
+        workflow_id: str,
+    ) -> typing.Optional[mlrun_pipelines.models.PipelineManifest]:
         kfp_url = mlrun.mlconf.resolve_kfp_url(mlrun.mlconf.namespace)
         if not kfp_url:
             raise mlrun.errors.MLRunNotFoundError(
@@ -495,11 +496,8 @@ class NotificationPusher(_NotificationPusherBase):
         if not kfp_run:
             return None
 
-        kfp_run =
-
-        return json.loads(kfp_run["pipeline_runtime"]["workflow_manifest"])
-        except Exception:
-            return None
+        kfp_run = mlrun_pipelines.models.PipelineRun(kfp_run)
+        return kfp_run.workflow_manifest()
 
     def _extract_function_uri(self, function_uri: str) -> tuple[str, str, str]:
         """
mlrun/utils/version/version.json
CHANGED