mlrun 1.7.0rc4__py3-none-any.whl → 1.7.0rc20__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of mlrun might be problematic.
- mlrun/__init__.py +11 -1
- mlrun/__main__.py +25 -111
- mlrun/{datastore/helpers.py → alerts/__init__.py} +2 -5
- mlrun/alerts/alert.py +144 -0
- mlrun/api/schemas/__init__.py +4 -3
- mlrun/artifacts/__init__.py +8 -3
- mlrun/artifacts/base.py +38 -254
- mlrun/artifacts/dataset.py +9 -190
- mlrun/artifacts/manager.py +41 -47
- mlrun/artifacts/model.py +30 -158
- mlrun/artifacts/plots.py +23 -380
- mlrun/common/constants.py +68 -0
- mlrun/common/formatters/__init__.py +19 -0
- mlrun/{model_monitoring/stores/models/sqlite.py → common/formatters/artifact.py} +6 -8
- mlrun/common/formatters/base.py +78 -0
- mlrun/common/formatters/function.py +41 -0
- mlrun/common/formatters/pipeline.py +53 -0
- mlrun/common/formatters/project.py +51 -0
- mlrun/{runtimes → common/runtimes}/constants.py +32 -4
- mlrun/common/schemas/__init__.py +25 -4
- mlrun/common/schemas/alert.py +203 -0
- mlrun/common/schemas/api_gateway.py +148 -0
- mlrun/common/schemas/artifact.py +15 -5
- mlrun/common/schemas/auth.py +8 -2
- mlrun/common/schemas/client_spec.py +2 -0
- mlrun/common/schemas/frontend_spec.py +1 -0
- mlrun/common/schemas/function.py +4 -0
- mlrun/common/schemas/hub.py +7 -9
- mlrun/common/schemas/model_monitoring/__init__.py +19 -3
- mlrun/common/schemas/model_monitoring/constants.py +96 -26
- mlrun/common/schemas/model_monitoring/grafana.py +9 -5
- mlrun/common/schemas/model_monitoring/model_endpoints.py +86 -2
- mlrun/{runtimes/mpijob/v1alpha1.py → common/schemas/pagination.py} +10 -13
- mlrun/common/schemas/pipeline.py +0 -9
- mlrun/common/schemas/project.py +22 -21
- mlrun/common/types.py +7 -1
- mlrun/config.py +87 -19
- mlrun/data_types/data_types.py +4 -0
- mlrun/data_types/to_pandas.py +9 -9
- mlrun/datastore/__init__.py +5 -8
- mlrun/datastore/alibaba_oss.py +130 -0
- mlrun/datastore/azure_blob.py +4 -5
- mlrun/datastore/base.py +69 -30
- mlrun/datastore/datastore.py +10 -2
- mlrun/datastore/datastore_profile.py +90 -6
- mlrun/datastore/google_cloud_storage.py +1 -1
- mlrun/datastore/hdfs.py +5 -0
- mlrun/datastore/inmem.py +2 -2
- mlrun/datastore/redis.py +2 -2
- mlrun/datastore/s3.py +5 -0
- mlrun/datastore/snowflake_utils.py +43 -0
- mlrun/datastore/sources.py +172 -44
- mlrun/datastore/store_resources.py +7 -7
- mlrun/datastore/targets.py +285 -41
- mlrun/datastore/utils.py +68 -5
- mlrun/datastore/v3io.py +27 -50
- mlrun/db/auth_utils.py +152 -0
- mlrun/db/base.py +149 -14
- mlrun/db/factory.py +1 -1
- mlrun/db/httpdb.py +608 -178
- mlrun/db/nopdb.py +191 -7
- mlrun/errors.py +11 -0
- mlrun/execution.py +37 -20
- mlrun/feature_store/__init__.py +0 -2
- mlrun/feature_store/api.py +21 -52
- mlrun/feature_store/feature_set.py +48 -23
- mlrun/feature_store/feature_vector.py +2 -1
- mlrun/feature_store/ingestion.py +7 -6
- mlrun/feature_store/retrieval/base.py +9 -4
- mlrun/feature_store/retrieval/conversion.py +9 -9
- mlrun/feature_store/retrieval/dask_merger.py +2 -0
- mlrun/feature_store/retrieval/job.py +9 -3
- mlrun/feature_store/retrieval/local_merger.py +2 -0
- mlrun/feature_store/retrieval/spark_merger.py +34 -24
- mlrun/feature_store/steps.py +30 -19
- mlrun/features.py +4 -13
- mlrun/frameworks/_dl_common/loggers/tensorboard_logger.py +7 -12
- mlrun/frameworks/auto_mlrun/auto_mlrun.py +2 -2
- mlrun/frameworks/lgbm/__init__.py +1 -1
- mlrun/frameworks/lgbm/callbacks/callback.py +2 -4
- mlrun/frameworks/lgbm/model_handler.py +1 -1
- mlrun/frameworks/parallel_coordinates.py +2 -1
- mlrun/frameworks/pytorch/__init__.py +2 -2
- mlrun/frameworks/sklearn/__init__.py +1 -1
- mlrun/frameworks/tf_keras/__init__.py +5 -2
- mlrun/frameworks/tf_keras/callbacks/logging_callback.py +1 -1
- mlrun/frameworks/tf_keras/mlrun_interface.py +2 -2
- mlrun/frameworks/xgboost/__init__.py +1 -1
- mlrun/k8s_utils.py +10 -11
- mlrun/launcher/__init__.py +1 -1
- mlrun/launcher/base.py +6 -5
- mlrun/launcher/client.py +8 -6
- mlrun/launcher/factory.py +1 -1
- mlrun/launcher/local.py +9 -3
- mlrun/launcher/remote.py +9 -3
- mlrun/lists.py +6 -2
- mlrun/model.py +58 -19
- mlrun/model_monitoring/__init__.py +1 -1
- mlrun/model_monitoring/api.py +127 -301
- mlrun/model_monitoring/application.py +5 -296
- mlrun/model_monitoring/applications/__init__.py +11 -0
- mlrun/model_monitoring/applications/_application_steps.py +157 -0
- mlrun/model_monitoring/applications/base.py +282 -0
- mlrun/model_monitoring/applications/context.py +214 -0
- mlrun/model_monitoring/applications/evidently_base.py +211 -0
- mlrun/model_monitoring/applications/histogram_data_drift.py +224 -93
- mlrun/model_monitoring/applications/results.py +99 -0
- mlrun/model_monitoring/controller.py +30 -36
- mlrun/model_monitoring/db/__init__.py +18 -0
- mlrun/model_monitoring/{stores → db/stores}/__init__.py +43 -36
- mlrun/model_monitoring/db/stores/base/__init__.py +15 -0
- mlrun/model_monitoring/{stores/model_endpoint_store.py → db/stores/base/store.py} +58 -32
- mlrun/model_monitoring/db/stores/sqldb/__init__.py +13 -0
- mlrun/model_monitoring/db/stores/sqldb/models/__init__.py +71 -0
- mlrun/model_monitoring/{stores → db/stores/sqldb}/models/base.py +109 -5
- mlrun/model_monitoring/db/stores/sqldb/models/mysql.py +88 -0
- mlrun/model_monitoring/{stores/models/mysql.py → db/stores/sqldb/models/sqlite.py} +19 -13
- mlrun/model_monitoring/db/stores/sqldb/sql_store.py +684 -0
- mlrun/model_monitoring/db/stores/v3io_kv/__init__.py +13 -0
- mlrun/model_monitoring/{stores/kv_model_endpoint_store.py → db/stores/v3io_kv/kv_store.py} +302 -155
- mlrun/model_monitoring/db/tsdb/__init__.py +100 -0
- mlrun/model_monitoring/db/tsdb/base.py +329 -0
- mlrun/model_monitoring/db/tsdb/helpers.py +30 -0
- mlrun/model_monitoring/db/tsdb/tdengine/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/tdengine/schemas.py +240 -0
- mlrun/model_monitoring/db/tsdb/tdengine/stream_graph_steps.py +45 -0
- mlrun/model_monitoring/db/tsdb/tdengine/tdengine_connector.py +397 -0
- mlrun/model_monitoring/db/tsdb/v3io/__init__.py +15 -0
- mlrun/model_monitoring/db/tsdb/v3io/stream_graph_steps.py +117 -0
- mlrun/model_monitoring/db/tsdb/v3io/v3io_connector.py +630 -0
- mlrun/model_monitoring/evidently_application.py +6 -118
- mlrun/model_monitoring/features_drift_table.py +34 -22
- mlrun/model_monitoring/helpers.py +100 -7
- mlrun/model_monitoring/model_endpoint.py +3 -2
- mlrun/model_monitoring/stream_processing.py +93 -228
- mlrun/model_monitoring/tracking_policy.py +7 -1
- mlrun/model_monitoring/writer.py +152 -124
- mlrun/package/packagers_manager.py +1 -0
- mlrun/package/utils/_formatter.py +2 -2
- mlrun/platforms/__init__.py +11 -10
- mlrun/platforms/iguazio.py +21 -202
- mlrun/projects/operations.py +30 -16
- mlrun/projects/pipelines.py +92 -99
- mlrun/projects/project.py +757 -268
- mlrun/render.py +15 -14
- mlrun/run.py +160 -162
- mlrun/runtimes/__init__.py +55 -3
- mlrun/runtimes/base.py +33 -19
- mlrun/runtimes/databricks_job/databricks_wrapper.py +1 -1
- mlrun/runtimes/funcdoc.py +0 -28
- mlrun/runtimes/kubejob.py +28 -122
- mlrun/runtimes/local.py +5 -2
- mlrun/runtimes/mpijob/__init__.py +0 -20
- mlrun/runtimes/mpijob/abstract.py +8 -8
- mlrun/runtimes/mpijob/v1.py +1 -1
- mlrun/runtimes/nuclio/__init__.py +1 -0
- mlrun/runtimes/nuclio/api_gateway.py +709 -0
- mlrun/runtimes/nuclio/application/__init__.py +15 -0
- mlrun/runtimes/nuclio/application/application.py +523 -0
- mlrun/runtimes/nuclio/application/reverse_proxy.go +95 -0
- mlrun/runtimes/nuclio/function.py +98 -58
- mlrun/runtimes/nuclio/serving.py +36 -42
- mlrun/runtimes/pod.py +196 -45
- mlrun/runtimes/remotesparkjob.py +1 -1
- mlrun/runtimes/sparkjob/spark3job.py +1 -1
- mlrun/runtimes/utils.py +6 -73
- mlrun/secrets.py +6 -2
- mlrun/serving/remote.py +2 -3
- mlrun/serving/routers.py +7 -4
- mlrun/serving/server.py +7 -8
- mlrun/serving/states.py +73 -43
- mlrun/serving/v2_serving.py +8 -7
- mlrun/track/tracker.py +2 -1
- mlrun/utils/async_http.py +25 -5
- mlrun/utils/helpers.py +141 -75
- mlrun/utils/http.py +1 -1
- mlrun/utils/logger.py +39 -7
- mlrun/utils/notifications/notification/__init__.py +14 -9
- mlrun/utils/notifications/notification/base.py +12 -0
- mlrun/utils/notifications/notification/console.py +2 -0
- mlrun/utils/notifications/notification/git.py +3 -1
- mlrun/utils/notifications/notification/ipython.py +2 -0
- mlrun/utils/notifications/notification/slack.py +101 -21
- mlrun/utils/notifications/notification/webhook.py +11 -1
- mlrun/utils/notifications/notification_pusher.py +147 -16
- mlrun/utils/retryer.py +3 -2
- mlrun/utils/v3io_clients.py +0 -1
- mlrun/utils/version/version.json +2 -2
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/METADATA +33 -18
- mlrun-1.7.0rc20.dist-info/RECORD +353 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/WHEEL +1 -1
- mlrun/kfpops.py +0 -868
- mlrun/model_monitoring/batch.py +0 -974
- mlrun/model_monitoring/stores/models/__init__.py +0 -27
- mlrun/model_monitoring/stores/sql_model_endpoint_store.py +0 -382
- mlrun/platforms/other.py +0 -305
- mlrun-1.7.0rc4.dist-info/RECORD +0 -321
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/LICENSE +0 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/entry_points.txt +0 -0
- {mlrun-1.7.0rc4.dist-info → mlrun-1.7.0rc20.dist-info}/top_level.txt +0 -0
mlrun/utils/logger.py
CHANGED

@@ -13,6 +13,7 @@
 # limitations under the License.

 import logging
+import typing
 from enum import Enum
 from sys import stdout
 from traceback import format_exception

@@ -92,7 +93,25 @@ class HumanReadableFormatter(_BaseFormatter):

 class HumanReadableExtendedFormatter(HumanReadableFormatter):
     def format(self, record) -> str:
-        more =
+        more = ""
+        record_with = self._record_with(record)
+        if record_with:
+
+            def _format_value(val):
+                formatted_val = (
+                    val
+                    if isinstance(val, str)
+                    else str(orjson.loads(self._json_dump(val)))
+                )
+                return (
+                    formatted_val.replace("\n", "\n\t\t")
+                    if len(formatted_val) < 4096
+                    else repr(formatted_val)
+                )
+
+            more = "\n\t" + "\n\t".join(
+                [f"{key}: {_format_value(val)}" for key, val in record_with.items()]
+            )
         return (
             "> "
             f"{self.formatTime(record, self.datefmt)} "

@@ -221,14 +240,27 @@ class FormatterKinds(Enum):
     JSON = "json"


-def
+def resolve_formatter_by_kind(
+    formatter_kind: FormatterKinds,
+) -> type[
+    typing.Union[HumanReadableFormatter, HumanReadableExtendedFormatter, JSONFormatter]
+]:
     return {
-        FormatterKinds.HUMAN: HumanReadableFormatter
-        FormatterKinds.HUMAN_EXTENDED: HumanReadableExtendedFormatter
-        FormatterKinds.JSON: JSONFormatter
+        FormatterKinds.HUMAN: HumanReadableFormatter,
+        FormatterKinds.HUMAN_EXTENDED: HumanReadableExtendedFormatter,
+        FormatterKinds.JSON: JSONFormatter,
     }[formatter_kind]


+def create_test_logger(name: str = "mlrun", stream: IO[str] = stdout) -> Logger:
+    return create_logger(
+        level="debug",
+        formatter_kind=FormatterKinds.HUMAN_EXTENDED.name,
+        name=name,
+        stream=stream,
+    )
+
+
 def create_logger(
     level: Optional[str] = None,
     formatter_kind: str = FormatterKinds.HUMAN.name,

@@ -243,11 +275,11 @@ def create_logger(
     logger_instance = Logger(level, name=name, propagate=False)

     # resolve formatter
-    formatter_instance =
+    formatter_instance = resolve_formatter_by_kind(
         FormatterKinds(formatter_kind.lower())
     )

     # set handler
-    logger_instance.set_handler("default", stream or stdout, formatter_instance)
+    logger_instance.set_handler("default", stream or stdout, formatter_instance())

     return logger_instance
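Note: per the hunks above, the formatter lookup now returns formatter classes (instantiated by create_logger via formatter_instance()), and a create_test_logger helper builds a debug-level, human-extended logger. A minimal usage sketch, assuming only the names visible in this diff; the call site itself is illustrative, not part of the package:

    from mlrun.utils.logger import FormatterKinds, create_logger, resolve_formatter_by_kind

    # resolve_formatter_by_kind() now returns a class, not an instance
    formatter_cls = resolve_formatter_by_kind(FormatterKinds.HUMAN_EXTENDED)
    formatter = formatter_cls()

    # create_logger() instantiates the resolved class when wiring the default handler
    logger = create_logger(level="debug", formatter_kind=FormatterKinds.HUMAN_EXTENDED.name)
    logger.debug("ingestion finished", rows=10000, source="s3://bucket/data.parquet")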
mlrun/utils/notifications/notification/__init__.py
CHANGED

@@ -51,14 +51,19 @@ class NotificationTypes(str, enum.Enum):
             self.console: [self.ipython],
         }.get(self, [])

+    @classmethod
+    def local(cls) -> list[str]:
+        return [
+            cls.console,
+            cls.ipython,
+        ]
+
     @classmethod
     def all(cls) -> list[str]:
-        return
-        ]
-        )
+        return [
+            cls.console,
+            cls.git,
+            cls.ipython,
+            cls.slack,
+            cls.webhook,
+        ]
mlrun/utils/notifications/notification/base.py
CHANGED

@@ -44,6 +44,8 @@ class NotificationBase:
         ] = mlrun.common.schemas.NotificationSeverity.INFO,
         runs: typing.Union[mlrun.lists.RunList, list] = None,
         custom_html: str = None,
+        alert: mlrun.common.schemas.AlertConfig = None,
+        event_data: mlrun.common.schemas.Event = None,
     ):
         raise NotImplementedError()

@@ -61,6 +63,8 @@ class NotificationBase:
         ] = mlrun.common.schemas.NotificationSeverity.INFO,
         runs: typing.Union[mlrun.lists.RunList, list] = None,
         custom_html: str = None,
+        alert: mlrun.common.schemas.AlertConfig = None,
+        event_data: mlrun.common.schemas.Event = None,
     ) -> str:
         if custom_html:
             return custom_html

@@ -68,6 +72,14 @@ class NotificationBase:
         if self.name:
             message = f"{self.name}: {message}"

+        if alert:
+            if not event_data:
+                return f"[{severity}] {message}"
+            return (
+                f"[{severity}] {message} for project {alert.project} "
+                f"UID {event_data.entity.ids[0]}. Values {event_data.value_dict}"
+            )
+
         if not runs:
             return f"[{severity}] {message}"
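For alert events, _get_html now returns a plain string built from the alert and event objects instead of the run table. A rough illustration of the resulting text, with the field names taken from the hunk above and the values invented:

    # illustrative values only
    severity, message = "high", "failed-job-alert"
    project, uid, values = "my-project", "a1b2c3", {"number_of_events": 3}
    print(f"[{severity}] {message} for project {project} UID {uid}. Values {values}")
    # [high] failed-job-alert for project my-project UID a1b2c3. Values {'number_of_events': 3}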
mlrun/utils/notifications/notification/console.py
CHANGED

@@ -36,6 +36,8 @@ class ConsoleNotification(NotificationBase):
         ] = mlrun.common.schemas.NotificationSeverity.INFO,
         runs: typing.Union[mlrun.lists.RunList, list] = None,
         custom_html: str = None,
+        alert: mlrun.common.schemas.AlertConfig = None,
+        event_data: mlrun.common.schemas.Event = None,
     ):
         severity = self._resolve_severity(severity)
         print(f"[{severity}] {message}")
mlrun/utils/notifications/notification/git.py
CHANGED

@@ -38,6 +38,8 @@ class GitNotification(NotificationBase):
         ] = mlrun.common.schemas.NotificationSeverity.INFO,
         runs: typing.Union[mlrun.lists.RunList, list] = None,
         custom_html: str = None,
+        alert: mlrun.common.schemas.AlertConfig = None,
+        event_data: mlrun.common.schemas.Event = None,
     ):
         git_repo = self.params.get("repo", None)
         git_issue = self.params.get("issue", None)

@@ -50,7 +52,7 @@ class GitNotification(NotificationBase):
         server = self.params.get("server", None)
         gitlab = self.params.get("gitlab", False)
         await self._pr_comment(
-            self._get_html(message, severity, runs, custom_html),
+            self._get_html(message, severity, runs, custom_html, alert, event_data),
             git_repo,
             git_issue,
             merge_request=git_merge_request,
mlrun/utils/notifications/notification/ipython.py
CHANGED

@@ -53,6 +53,8 @@ class IPythonNotification(NotificationBase):
         ] = mlrun.common.schemas.NotificationSeverity.INFO,
         runs: typing.Union[mlrun.lists.RunList, list] = None,
         custom_html: str = None,
+        alert: mlrun.common.schemas.AlertConfig = None,
+        event_data: mlrun.common.schemas.Event = None,
     ):
         if not self._ipython:
             mlrun.utils.helpers.logger.debug(
mlrun/utils/notifications/notification/slack.py
CHANGED

@@ -32,6 +32,7 @@ class SlackNotification(NotificationBase):
         "completed": ":smiley:",
         "running": ":man-running:",
         "error": ":x:",
+        "skipped": ":zzz:",
     }

     async def push(

@@ -42,6 +43,8 @@ class SlackNotification(NotificationBase):
         ] = mlrun.common.schemas.NotificationSeverity.INFO,
         runs: typing.Union[mlrun.lists.RunList, list] = None,
         custom_html: str = None,
+        alert: mlrun.common.schemas.AlertConfig = None,
+        event_data: mlrun.common.schemas.Event = None,
     ):
         webhook = self.params.get("webhook", None) or mlrun.get_secret_or_env(
             "SLACK_WEBHOOK"

@@ -53,7 +56,7 @@ class SlackNotification(NotificationBase):
             )
             return

-        data = self._generate_slack_data(message, severity, runs)
+        data = self._generate_slack_data(message, severity, runs, alert, event_data)

         async with aiohttp.ClientSession() as session:
             async with session.post(webhook, json=data) as response:

@@ -66,57 +69,134 @@ class SlackNotification(NotificationBase):
             mlrun.common.schemas.NotificationSeverity, str
         ] = mlrun.common.schemas.NotificationSeverity.INFO,
         runs: typing.Union[mlrun.lists.RunList, list] = None,
+        alert: mlrun.common.schemas.AlertConfig = None,
+        event_data: mlrun.common.schemas.Event = None,
     ) -> dict:
         data = {
-            "blocks":
-                {
-                    "type": "section",
-                    "text": self._get_slack_row(f"[{severity}] {message}"),
-                },
-            ]
+            "blocks": self._generate_slack_header_blocks(severity, message),
         }
         if self.name:
             data["blocks"].append(
                 {"type": "section", "text": self._get_slack_row(self.name)}
             )

-        if
+        if alert:
+            fields = self._get_alert_fields(alert, event_data)
+
+            for i in range(len(fields)):
+                data["blocks"].append({"type": "section", "text": fields[i]})
+        else:
+            if not runs:
+                return data

-            fields.append(self._get_run_line(run))
-            fields.append(self._get_run_result(run))
+            if isinstance(runs, list):
+                runs = mlrun.lists.RunList(runs)

+            fields = [self._get_slack_row("*Runs*"), self._get_slack_row("*Results*")]
+            for run in runs:
+                fields.append(self._get_run_line(run))
+                fields.append(self._get_run_result(run))
+
+            for i in range(0, len(fields), 8):
+                data["blocks"].append({"type": "section", "fields": fields[i : i + 8]})

         return data

+    def _generate_slack_header_blocks(self, severity: str, message: str):
+        header_text = block_text = f"[{severity}] {message}"
+        section_text = None
+
+        # Slack doesn't allow headers to be longer than 150 characters
+        # If there's a comma in the message, split the message at the comma
+        # Otherwise, split the message at 150 characters
+        if len(block_text) > 150:
+            if ", " in block_text and block_text.index(", ") < 149:
+                header_text = block_text.split(",")[0]
+                section_text = block_text[len(header_text) + 2 :]
+            else:
+                header_text = block_text[:150]
+                section_text = block_text[150:]
+        blocks = [
+            {"type": "header", "text": {"type": "plain_text", "text": header_text}}
+        ]
+        if section_text:
+            blocks.append(
+                {
+                    "type": "section",
+                    "text": self._get_slack_row(section_text),
+                }
+            )
+        return blocks
+
+    def _get_alert_fields(
+        self,
+        alert: mlrun.common.schemas.AlertConfig,
+        event_data: mlrun.common.schemas.Event,
+    ) -> list:
+        line = [
+            self._get_slack_row(f":bell: {alert.name} alert has occurred"),
+            self._get_slack_row(f"*Project:*\n{alert.project}"),
+            self._get_slack_row(f"*ID:*\n{event_data.entity.ids[0]}"),
+        ]
+
+        if alert.summary:
+            line.append(
+                self._get_slack_row(
+                    f"*Summary:*\n{mlrun.utils.helpers.format_alert_summary(alert, event_data)}"
+                )
+            )
+
+        if event_data.value_dict:
+            data_lines = []
+            for key, value in event_data.value_dict.items():
+                data_lines.append(f"{key}: {value}")
+            data_text = "\n".join(data_lines)
+            line.append(self._get_slack_row(f"*Event data:*\n{data_text}"))
+
+        if (
+            event_data.entity.kind == mlrun.common.schemas.alert.EventEntityKind.JOB
+        ):  # JOB entity
+            uid = event_data.value_dict.get("uid")
+            url = mlrun.utils.helpers.get_ui_url(alert.project, uid)
+            overview_type = "Job overview"
+        else:  # MODEL entity
+            model_name = event_data.value_dict.get("model")
+            model_endpoint_id = event_data.value_dict.get("model_endpoint_id")
+            url = mlrun.utils.helpers.get_model_endpoint_url(
+                alert.project, model_name, model_endpoint_id
+            )
+            overview_type = "Model endpoint"
+
+        line.append(self._get_slack_row(f"*Overview:*\n<{url}|*{overview_type}*>"))
+
+        return line
+
     def _get_run_line(self, run: dict) -> dict:
         meta = run["metadata"]
         url = mlrun.utils.helpers.get_ui_url(meta.get("project"), meta.get("uid"))
+
+        # Only show the URL if the run is not a function (serving or mlrun function)
+        kind = run.get("step_kind")
+        state = run["status"].get("state", "")
+        if state != "skipped" and (url and not kind or kind == "run"):
             line = f'<{url}|*{meta.get("name")}*>'
         else:
             line = meta.get("name")
+        if kind:
+            line = f'{line} *({run.get("step_kind", run.get("kind", ""))})*'
         line = f'{self.emojis.get(state, ":question:")} {line}'
         return self._get_slack_row(line)

     def _get_run_result(self, run: dict) -> dict:
         state = run["status"].get("state", "")
         if state == "error":
-            error_status = run["status"].get("error", "")
+            error_status = run["status"].get("error", "") or state
             result = f"*{error_status}*"
         else:
             result = mlrun.utils.helpers.dict_to_str(
                 run["status"].get("results", {}), ", "
             )
-        return self._get_slack_row(result or
+        return self._get_slack_row(result or state)

     @staticmethod
     def _get_slack_row(text: str) -> dict:
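The new _generate_slack_header_blocks works around Slack's 150-character limit on header blocks: a long message is split at the first ", " when it occurs early enough, otherwise hard-wrapped at 150 characters, with the remainder emitted as a separate section block. A standalone sketch of just that splitting rule (same logic as the hunk above, not the package's code):

    def split_header(text):
        header, section = text, None
        if len(text) > 150:
            if ", " in text and text.index(", ") < 149:
                header = text.split(",")[0]
                section = text[len(header) + 2:]
            else:
                header, section = text[:150], text[150:]
        return header, section

    header, section = split_header("[high] drift detected, " + "x" * 160)
    # header -> "[high] drift detected"; section -> the remaining text, rendered as a section block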
mlrun/utils/notifications/notification/webhook.py
CHANGED

@@ -36,6 +36,8 @@ class WebhookNotification(NotificationBase):
         ] = mlrun.common.schemas.NotificationSeverity.INFO,
         runs: typing.Union[mlrun.lists.RunList, list] = None,
         custom_html: str = None,
+        alert: mlrun.common.schemas.AlertConfig = None,
+        event_data: mlrun.common.schemas.Event = None,
     ):
         url = self.params.get("url", None)
         method = self.params.get("method", "post").lower()

@@ -46,9 +48,17 @@ class WebhookNotification(NotificationBase):
         request_body = {
             "message": message,
             "severity": severity,
-            "runs": runs,
         }

+        if runs:
+            request_body["runs"] = runs
+
+        if alert:
+            request_body["alert"] = alert.dict()
+        if event_data:
+            request_body["value"] = event_data.value_dict
+            request_body["id"] = event_data.entity.ids[0]
+
         if custom_html:
             request_body["custom_html"] = custom_html
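With these changes the webhook payload only carries "runs" when there are runs, and gains alert fields when the notification is triggered by an alert event. A hypothetical request body for an alert; the keys come from the hunk above, while the values and the contents of alert.dict() are invented for illustration:

    request_body = {
        "message": "data-drift-detected",
        "severity": "high",
        "alert": {"name": "drift", "project": "my-project"},  # alert.dict(), fields assumed
        "value": {"kl_divergence": 0.82},                      # event_data.value_dict
        "id": "endpoint-1234",                                 # event_data.entity.ids[0]
    }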
mlrun/utils/notifications/notification_pusher.py
CHANGED

@@ -15,10 +15,17 @@
 import asyncio
 import datetime
 import os
+import re
 import traceback
 import typing
 from concurrent.futures import ThreadPoolExecutor

+import kfp
+import mlrun_pipelines.common.ops
+import mlrun_pipelines.models
+
+import mlrun.common.constants as mlrun_constants
+import mlrun.common.runtimes.constants
 import mlrun.common.schemas
 import mlrun.config
 import mlrun.db.base

@@ -233,25 +240,12 @@ class NotificationPusher(_NotificationPusherBase):
             resource = "Run"
             runs = [run.to_dict()]

-        if
-            resource =
+        if mlrun_constants.MLRunInternalLabels.workflow in run.metadata.labels:
+            resource = mlrun_constants.MLRunInternalLabels.workflow
             custom_message = (
                 f" (workflow: {run.metadata.labels['workflow']}){custom_message}"
             )
-            workflow_id = run.status.results.get("workflow_id", None)
-            if workflow_id:
-                workflow_runs = db.list_runs(
-                    project=run.metadata.project,
-                    labels=f"workflow={workflow_id}",
-                )
-                logger.debug(
-                    "Found workflow runs, extending notification runs",
-                    workflow_id=workflow_id,
-                    workflow_runs_amount=len(workflow_runs),
-                )
-                runs.extend(workflow_runs)
+            runs.extend(self.get_workflow_steps(run))

         message = (
             self.messages.get(run.state(), "").format(resource=resource)

@@ -395,6 +389,137 @@ class NotificationPusher(_NotificationPusherBase):
             mask_params=False,
         )

+    def get_workflow_steps(self, run: mlrun.model.RunObject) -> list:
+        steps = []
+        db = mlrun.get_run_db()
+
+        def _add_run_step(_step: mlrun_pipelines.models.PipelineStep):
+            try:
+                _run = db.list_runs(
+                    project=run.metadata.project,
+                    labels=f"mlrun_constants.MLRunInternalLabels.runner_pod={_step.node_name}",
+                )[0]
+            except IndexError:
+                _run = {
+                    "metadata": {
+                        "name": _step.display_name,
+                        "project": run.metadata.project,
+                    },
+                }
+            _run["step_kind"] = _step.step_type
+            if _step.skipped:
+                _run.setdefault("status", {})["state"] = (
+                    mlrun.common.runtimes.constants.RunStates.skipped
+                )
+            steps.append(_run)
+
+        def _add_deploy_function_step(_step: mlrun_pipelines.models.PipelineStep):
+            project, name, hash_key = self._extract_function_uri(
+                _step.get_annotation("mlrun/function-uri")
+            )
+            if name:
+                try:
+                    function = db.get_function(
+                        project=project, name=name, hash_key=hash_key
+                    )
+                except mlrun.errors.MLRunNotFoundError:
+                    # If the function is not found (if build failed for example), we will create a dummy
+                    # function object for the notification to display the function name
+                    function = {
+                        "metadata": {
+                            "name": name,
+                            "project": project,
+                            "hash_key": hash_key,
+                        },
+                    }
+                pod_phase = _step.phase
+                if _step.skipped:
+                    state = mlrun.common.schemas.FunctionState.skipped
+                else:
+                    state = mlrun.common.runtimes.constants.PodPhases.pod_phase_to_run_state(
+                        pod_phase
+                    )
+                function["status"] = {"state": state}
+                if isinstance(function["metadata"].get("updated"), datetime.datetime):
+                    function["metadata"]["updated"] = function["metadata"][
+                        "updated"
+                    ].isoformat()
+                function["step_kind"] = _step.step_type
+                steps.append(function)
+
+        step_methods = {
+            mlrun_pipelines.common.ops.PipelineRunType.run: _add_run_step,
+            mlrun_pipelines.common.ops.PipelineRunType.build: _add_deploy_function_step,
+            mlrun_pipelines.common.ops.PipelineRunType.deploy: _add_deploy_function_step,
+        }
+
+        workflow_id = run.status.results.get("workflow_id", None)
+        if not workflow_id:
+            return steps
+
+        workflow_manifest = self._get_workflow_manifest(workflow_id)
+        if not workflow_manifest:
+            return steps
+
+        try:
+            for step in workflow_manifest.get_steps():
+                step_method = step_methods.get(step.step_type)
+                if step_method:
+                    step_method(step)
+            return steps
+        except Exception:
+            # If we fail to read the pipeline steps, we will return the list of runs that have the same workflow id
+            logger.warning(
+                "Failed to extract workflow steps from workflow manifest, "
+                "returning all runs with the workflow id label",
+                workflow_id=workflow_id,
+                traceback=traceback.format_exc(),
+            )
+            return db.list_runs(
+                project=run.metadata.project,
+                labels=f"workflow={workflow_id}",
+            )
+
+    @staticmethod
+    def _get_workflow_manifest(
+        workflow_id: str,
+    ) -> typing.Optional[mlrun_pipelines.models.PipelineManifest]:
+        kfp_url = mlrun.mlconf.resolve_kfp_url(mlrun.mlconf.namespace)
+        if not kfp_url:
+            raise mlrun.errors.MLRunNotFoundError(
+                "KubeFlow Pipelines is not configured"
+            )
+
+        kfp_client = kfp.Client(host=kfp_url)
+
+        # arbitrary timeout of 5 seconds, the workflow should be done by now
+        kfp_run = kfp_client.wait_for_run_completion(workflow_id, 5)
+        if not kfp_run:
+            return None
+
+        kfp_run = mlrun_pipelines.models.PipelineRun(kfp_run)
+        return kfp_run.workflow_manifest()
+
+    def _extract_function_uri(self, function_uri: str) -> tuple[str, str, str]:
+        """
+        Extract the project, name, and hash key from a function uri.
+        Examples:
+        - "project/name@hash_key" returns project, name, hash_key
+        - "project/name returns" project, name, ""
+        """
+        project, name, hash_key = None, None, None
+        hashed_pattern = r"^(.+)/(.+)@(.+)$"
+        pattern = r"^(.+)/(.+)$"
+        match = re.match(hashed_pattern, function_uri)
+        if match:
+            project, name, hash_key = match.groups()
+        else:
+            match = re.match(pattern, function_uri)
+            if match:
+                project, name = match.groups()
+                hash_key = ""
+        return project, name, hash_key
+

 class CustomNotificationPusher(_NotificationPusherBase):
     def __init__(self, notification_types: list[str] = None):

@@ -413,6 +538,12 @@ class CustomNotificationPusher(_NotificationPusherBase):
             if notification.is_async
         }

+    @property
+    def notifications(self):
+        notifications = self._sync_notifications.copy()
+        notifications.update(self._async_notifications)
+        return notifications
+
     def push(
         self,
         message: str,
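The new _extract_function_uri helper parses the "mlrun/function-uri" annotation with two regular expressions, preferring the hashed form. A quick sketch of the same patterns applied to made-up URIs:

    import re

    hashed_pattern = r"^(.+)/(.+)@(.+)$"   # project/name@hash_key
    pattern = r"^(.+)/(.+)$"               # project/name

    print(re.match(hashed_pattern, "my-project/trainer@0f27c1e").groups())
    # ('my-project', 'trainer', '0f27c1e')
    print(re.match(pattern, "my-project/trainer").groups())
    # ('my-project', 'trainer')  -- hash_key then falls back to ""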
mlrun/utils/retryer.py
CHANGED

@@ -117,7 +117,7 @@ class Retryer:
             self._raise_last_exception()

     def _prepare(self):
-        self.start_time = time.
+        self.start_time = time.monotonic()
         self.last_exception = None

         # Check if backoff is just a simple interval

@@ -138,6 +138,7 @@ class Retryer:
         except mlrun.errors.MLRunFatalFailureError as exc:
             raise exc.original_exception
         except Exception as exc:
+            self.last_exception = exc
             return (
                 None,
                 self.last_exception,

@@ -172,7 +173,7 @@ class Retryer:
         ) from self.last_exception

     def _timeout_exceeded(self, next_interval=None):
-        now = time.
+        now = time.monotonic()
         if next_interval:
             now = now + next_interval
         return self.timeout is not None and now >= self.start_time + self.timeout
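Switching the retryer to time.monotonic() makes timeout accounting immune to wall-clock jumps (NTP corrections, manual clock changes), which wall-clock based timing is not. A small sketch of the measurement pattern:

    import time

    start = time.monotonic()
    time.sleep(0.1)
    elapsed = time.monotonic() - start   # ~0.1s even if the system clock was adjusted meanwhile
    print(f"elapsed: {elapsed:.3f}s")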
mlrun/utils/v3io_clients.py
CHANGED

@@ -11,7 +11,6 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
-#

 from v3io.dataplane import Client as V3IOClient
 from v3io_frames import Client as get_client
mlrun/utils/version/version.json
CHANGED